; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s

; Every kernel below performs exactly one integer division or remainder on its
; two scalar arguments and stores the result through %out. The first group of
; expectations in each kernel is matched against the IR printed by the opt RUN
; line; the second group is matched against the assembly printed by the llc
; RUN line.

; Unsigned 32-bit quotient of %x by %y.
define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @udiv_i32(
; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41F0000000000000
; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[Y]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP7]], 32
; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
; CHECK-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP8]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP10]], 0
; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP4]], [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[TMP4]], [[TMP19]]
; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP12]], i32 [[TMP20]], i32 [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64
; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[X:%.*]] to i64
; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[Y]]
; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[X]], [[TMP29]]
; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[Y]]
; CHECK-NEXT: [[TMP32:%.*]] = icmp uge i32 [[X]], [[TMP29]]
; CHECK-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP28]], 1
; CHECK-NEXT: [[TMP35:%.*]] = sub i32 [[TMP28]], 1
; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP28]]
; CHECK-NEXT: [[TMP37:%.*]] = select i1 [[TMP32]], i32 [[TMP36]], i32 [[TMP35]]
; CHECK-NEXT: store i32 [[TMP37]], i32 addrspace(1)* [[OUT:%.*]]
; CHECK-NEXT: ret void
;
; GCN-LABEL: udiv_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9
; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_mul_lo_u32 v1, v0, s9
; GCN-NEXT: v_mul_hi_u32 v2, v0, s9
; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v2
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3]
; GCN-NEXT: v_mul_hi_u32 v1, v1, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3]
; GCN-NEXT: v_mul_hi_u32 v0, v0, s8
; GCN-NEXT: v_mul_lo_u32 v1, v0, s9
; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0
; GCN-NEXT: v_sub_i32_e32 v4, vcc, s8, v1
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, s8, v1
; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
  %quot = udiv i32 %x, %y
  store i32 %quot, i32 addrspace(1)* %out
  ret void
}

; Unsigned 32-bit remainder of %x by %y.
define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @urem_i32(
; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41F0000000000000
; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[Y]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP7]], 32
; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
; CHECK-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP8]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP10]], 0
; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP4]], [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[TMP4]], [[TMP19]]
; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP12]], i32 [[TMP20]], i32 [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64
; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[X:%.*]] to i64
; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[Y]]
; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[X]], [[TMP29]]
; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[Y]]
; CHECK-NEXT: [[TMP32:%.*]] = icmp uge i32 [[X]], [[TMP29]]
; CHECK-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[Y]]
; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP30]], [[Y]]
; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP30]]
; CHECK-NEXT: [[TMP37:%.*]] = select i1 [[TMP32]], i32 [[TMP36]], i32 [[TMP35]]
; CHECK-NEXT: store i32 [[TMP37]], i32 addrspace(1)* [[OUT:%.*]]
; CHECK-NEXT: ret void
;
; GCN-LABEL: urem_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9
; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_mul_lo_u32 v1, v0, s9
; GCN-NEXT: v_mul_hi_u32 v2, v0, s9
; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v2
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3]
; GCN-NEXT: v_mul_hi_u32 v1, v1, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3]
; GCN-NEXT: v_mul_hi_u32 v0, v0, s8
; GCN-NEXT: v_mul_lo_u32 v0, v0, s9
; GCN-NEXT: v_sub_i32_e32 v1, vcc, s8, v0
; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s8, v0
; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v1
; GCN-NEXT: v_add_i32_e32 v2, vcc, s9, v1
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s9, v1
; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3]
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[2:3]
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
  %rem = urem i32 %x, %y
  store i32 %rem, i32 addrspace(1)* %out
  ret void
}

; Signed 32-bit quotient of %x by %y.
define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @sdiv_i32(
; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
; CHECK-NEXT: [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41F0000000000000
; CHECK-NEXT: [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
; CHECK-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT: [[TMP18:%.*]] = sub i32 0, [[TMP15]]
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP18]], i32 [[TMP15]]
; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP20]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
; CHECK-NEXT: [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[TMP11]], [[TMP26]]
; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP11]], [[TMP26]]
; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP19]], i32 [[TMP27]], i32 [[TMP28]]
; CHECK-NEXT: [[TMP30:%.*]] = zext i32 [[TMP29]] to i64
; CHECK-NEXT: [[TMP31:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP30]], [[TMP31]]
; CHECK-NEXT: [[TMP33:%.*]] = trunc i64 [[TMP32]] to i32
; CHECK-NEXT: [[TMP34:%.*]] = lshr i64 [[TMP32]], 32
; CHECK-NEXT: [[TMP35:%.*]] = trunc i64 [[TMP34]] to i32
; CHECK-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], [[TMP7]]
; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP6]], [[TMP36]]
; CHECK-NEXT: [[TMP38:%.*]] = icmp uge i32 [[TMP37]], [[TMP7]]
; CHECK-NEXT: [[TMP39:%.*]] = icmp uge i32 [[TMP6]], [[TMP36]]
; CHECK-NEXT: [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
; CHECK-NEXT: [[TMP41:%.*]] = add i32 [[TMP35]], 1
; CHECK-NEXT: [[TMP42:%.*]] = sub i32 [[TMP35]], 1
; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP40]], i32 [[TMP41]], i32 [[TMP35]]
; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP39]], i32 [[TMP43]], i32 [[TMP42]]
; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP44]], [[TMP3]]
; CHECK-NEXT: [[TMP46:%.*]] = sub i32 [[TMP45]], [[TMP3]]
; CHECK-NEXT: store i32 [[TMP46]], i32 addrspace(1)* [[OUT:%.*]]
; CHECK-NEXT: ret void
;
; GCN-LABEL: sdiv_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_ashr_i32 s8, s3, 31
; GCN-NEXT: s_add_i32 s3, s3, s8
; GCN-NEXT: s_xor_b32 s9, s3, s8
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_add_i32 s2, s2, s3
; GCN-NEXT: s_xor_b32 s2, s2, s3
; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GCN-NEXT: s_xor_b32 s3, s3, s8
; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_mul_lo_u32 v1, v0, s9
; GCN-NEXT: v_mul_hi_u32 v2, v0, s9
; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GCN-NEXT: v_mul_hi_u32 v1, v1, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT: v_mul_hi_u32 v0, v0, s2
; GCN-NEXT: v_mul_lo_u32 v1, v0, s9
; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0
; GCN-NEXT: v_sub_i32_e32 v4, vcc, s2, v1
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, s2, v1
; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GCN-NEXT: v_xor_b32_e32 v0, s3, v0
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
  %quot = sdiv i32 %x, %y
  store i32 %quot, i32 addrspace(1)* %out
  ret void
}

; Signed 32-bit remainder of %x by %y.
define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @srem_i32(
; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41F0000000000000
; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP11]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP13]], 32
; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
; CHECK-NEXT: [[TMP17:%.*]] = sub i32 0, [[TMP14]]
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP17]], i32 [[TMP14]]
; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP10]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
; CHECK-NEXT: [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[TMP10]], [[TMP25]]
; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP10]], [[TMP25]]
; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP18]], i32 [[TMP26]], i32 [[TMP27]]
; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
; CHECK-NEXT: [[TMP30:%.*]] = zext i32 [[TMP5]] to i64
; CHECK-NEXT: [[TMP31:%.*]] = mul i64 [[TMP29]], [[TMP30]]
; CHECK-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
; CHECK-NEXT: [[TMP33:%.*]] = lshr i64 [[TMP31]], 32
; CHECK-NEXT: [[TMP34:%.*]] = trunc i64 [[TMP33]] to i32
; CHECK-NEXT: [[TMP35:%.*]] = mul i32 [[TMP34]], [[TMP6]]
; CHECK-NEXT: [[TMP36:%.*]] = sub i32 [[TMP5]], [[TMP35]]
; CHECK-NEXT: [[TMP37:%.*]] = icmp uge i32 [[TMP36]], [[TMP6]]
; CHECK-NEXT: [[TMP38:%.*]] = icmp uge i32 [[TMP5]], [[TMP35]]
; CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP36]], [[TMP6]]
; CHECK-NEXT: [[TMP41:%.*]] = add i32 [[TMP36]], [[TMP6]]
; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP39]], i32 [[TMP40]], i32 [[TMP36]]
; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP38]], i32 [[TMP42]], i32 [[TMP41]]
; CHECK-NEXT: [[TMP44:%.*]] = xor i32 [[TMP43]], [[TMP1]]
; CHECK-NEXT: [[TMP45:%.*]] = sub i32 [[TMP44]], [[TMP1]]
; CHECK-NEXT: store i32 [[TMP45]], i32 addrspace(1)* [[OUT:%.*]]
; CHECK-NEXT: ret void
;
; GCN-LABEL: srem_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_ashr_i32 s2, s5, 31
; GCN-NEXT: s_add_i32 s3, s5, s2
; GCN-NEXT: s_xor_b32 s10, s3, s2
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: s_ashr_i32 s8, s4, 31
; GCN-NEXT: s_add_i32 s4, s4, s8
; GCN-NEXT: s_xor_b32 s9, s4, s8
; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_mul_lo_u32 v1, v0, s10
; GCN-NEXT: v_mul_hi_u32 v2, v0, s10
; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v2
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3]
; GCN-NEXT: v_mul_hi_u32 v1, v1, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3]
; GCN-NEXT: v_mul_hi_u32 v0, v0, s9
; GCN-NEXT: v_mul_lo_u32 v0, v0, s10
; GCN-NEXT: v_sub_i32_e32 v1, vcc, s9, v0
; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s9, v0
; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v1
; GCN-NEXT: v_add_i32_e32 v2, vcc, s10, v1
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v1
; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3]
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[2:3]
; GCN-NEXT: v_xor_b32_e32 v0, s8, v0
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
  %rem = srem i32 %x, %y
  store i32 %rem, i32 addrspace(1)* %out
  ret void
}

; Unsigned 16-bit quotient of %x by %y.
define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @udiv_i16(
; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65535
; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
; CHECK-NEXT: store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]]
; CHECK-NEXT: ret void
;
; GCN-LABEL: udiv_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[0:1], 0xb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshr_b32 s3, s2, 16
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3
; GCN-NEXT: s_and_b32 s2, s2, 0xffff
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mul_f32_e32 v2, v1, v2
; GCN-NEXT: v_trunc_f32_e32 v2, v2
; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2
; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
  %quot = udiv i16 %x, %y
  store i16 %quot, i16 addrspace(1)* %out
  ret void
}

; Unsigned 16-bit remainder of %x by %y.
define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @urem_i16(
; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535
; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
; CHECK-NEXT: store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]]
; CHECK-NEXT: ret void
;
; GCN-LABEL: urem_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshr_b32 s2, s4, 16
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2
; GCN-NEXT: s_and_b32 s3, s4, 0xffff
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT: v_mul_f32_e32 v2, v1, v2
; GCN-NEXT: v_trunc_f32_e32 v2, v2
; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2
; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GCN-NEXT: v_mul_lo_u32 v0, v0, s2
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
  %rem = urem i16 %x, %y
  store i16 %rem, i16 addrspace(1)* %out
  ret void
}

; Signed 16-bit quotient of %x by %y.
define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @sdiv_i16(
; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 16
; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
; CHECK-NEXT: store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]]
; CHECK-NEXT: ret void
;
; GCN-LABEL: sdiv_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0xb
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_ashr_i32 s1, s0, 16
; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1
; GCN-NEXT: s_sext_i32_i16 s0, s0
; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0
; GCN-NEXT: s_xor_b32 s0, s0, s1
; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT: s_ashr_i32 s0, s0, 30
; GCN-NEXT: s_or_b32 s0, s0, 1
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mul_f32_e32 v2, v1, v2
; GCN-NEXT: v_trunc_f32_e32 v2, v2
; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
  %quot = sdiv i16 %x, %y
  store i16 %quot, i16 addrspace(1)* %out
  ret void
}

; Signed 16-bit remainder of %x by %y.
define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @srem_i16(
; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16
; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
; CHECK-NEXT: store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]]
; CHECK-NEXT: ret void
;
; GCN-LABEL: srem_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_ashr_i32 s2, s4, 16
; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2
; GCN-NEXT: s_sext_i32_i16 s3, s4
; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3
; GCN-NEXT: s_xor_b32 s3, s3, s2
; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT: s_ashr_i32 s3, s3, 30
; GCN-NEXT: s_or_b32 s3, s3, 1
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mul_f32_e32 v2, v1, v2
; GCN-NEXT: v_trunc_f32_e32 v2, v2
; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: v_mul_lo_u32 v0, v0, s2
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
  %rem = srem i16 %x, %y
  store i16 %rem, i16 addrspace(1)* %out
  ret void
}

; Unsigned 8-bit quotient of %x by %y.
define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @udiv_i8(
; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 255
; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
; CHECK-NEXT: store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]]
; CHECK-NEXT: ret void
;
; GCN-LABEL: udiv_i8:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0xb
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, s0
; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
; GCN-NEXT: v_mul_f32_e32 v1, v2, v1
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1
; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
  %quot = udiv i8 %x, %y
  store i8 %quot, i8 addrspace(1)* %out
  ret void
}

; Unsigned 8-bit remainder of %x by %y.
define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @urem_i8(
; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 255
; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
; CHECK-NEXT: store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]]
; CHECK-NEXT: ret void
;
; GCN-LABEL: urem_i8:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, s4
; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s4
; GCN-NEXT: s_lshr_b32 s2, s4, 8
; GCN-NEXT: v_mul_f32_e32 v1, v2, v1
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1
; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GCN-NEXT: v_mul_lo_u32 v0, v0, s2
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GCN-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
  %rem = urem i8 %x, %y
  store i8 %rem, i8 addrspace(1)* %out
  ret void
}

; Signed 8-bit quotient of %x by %y.
define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @sdiv_i8(
; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 24
; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
; CHECK-NEXT: store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]]
; CHECK-NEXT: ret void
;
; GCN-LABEL: sdiv_i8:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0xb
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bfe_i32 s1, s0, 0x80008
; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1
; GCN-NEXT: s_sext_i32_i8 s0, s0
; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0
; GCN-NEXT: s_xor_b32 s0, s0, s1
; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT: s_ashr_i32 s0, s0, 30
; GCN-NEXT: s_or_b32 s0, s0, 1
; GCN-NEXT: v_mov_b32_e32 v3, s0
; GCN-NEXT: v_mul_f32_e32 v2, v1, v2
; GCN-NEXT: v_trunc_f32_e32 v2, v2
; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
  %quot = sdiv i8 %x, %y
  store i8 %quot, i8 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @srem_i8(
; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 24
; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
; CHECK-NEXT: store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]]
; CHECK-NEXT: ret void
;
; GCN-LABEL: srem_i8:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0xb
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bfe_i32 s1, s0, 0x80008
; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1
; GCN-NEXT: s_sext_i32_i8 s3, s0
; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3
; GCN-NEXT: s_xor_b32 s1, s3, s1
; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT: s_ashr_i32 s1, s1, 30
; GCN-NEXT: s_or_b32 s1, s1, 1
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mul_f32_e32 v2, v1, v2
; GCN-NEXT: v_trunc_f32_e32 v2, v2
; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
770; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 771; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 772; GCN-NEXT: s_lshr_b32 s2, s0, 8 773; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 774; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 775; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 776; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 777; GCN-NEXT: s_endpgm 778 %r = srem i8 %x, %y 779 store i8 %r, i8 addrspace(1)* %out 780 ret void 781} 782 783define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 784; CHECK-LABEL: @udiv_v4i32( 785; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 786; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 787; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 788; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 789; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41F0000000000000 790; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 791; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 792; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP2]] to i64 793; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 794; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 795; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 796; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 797; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP10]] 798; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0 799; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP10]] 800; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 801; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP6]] to i64 802; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 803; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 804; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 805; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 806; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP6]], [[TMP21]] 807; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP6]], 
[[TMP21]] 808; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP14]], i32 [[TMP22]], i32 [[TMP23]] 809; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64 810; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP1]] to i64 811; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP25]], [[TMP26]] 812; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 813; CHECK-NEXT: [[TMP29:%.*]] = lshr i64 [[TMP27]], 32 814; CHECK-NEXT: [[TMP30:%.*]] = trunc i64 [[TMP29]] to i32 815; CHECK-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], [[TMP2]] 816; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP1]], [[TMP31]] 817; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP2]] 818; CHECK-NEXT: [[TMP34:%.*]] = icmp uge i32 [[TMP1]], [[TMP31]] 819; CHECK-NEXT: [[TMP35:%.*]] = and i1 [[TMP33]], [[TMP34]] 820; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP30]], 1 821; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP30]], 1 822; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP35]], i32 [[TMP36]], i32 [[TMP30]] 823; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP34]], i32 [[TMP38]], i32 [[TMP37]] 824; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i32> undef, i32 [[TMP39]], i64 0 825; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i32> [[X]], i64 1 826; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[Y]], i64 1 827; CHECK-NEXT: [[TMP43:%.*]] = uitofp i32 [[TMP42]] to float 828; CHECK-NEXT: [[TMP44:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP43]]) 829; CHECK-NEXT: [[TMP45:%.*]] = fmul fast float [[TMP44]], 0x41F0000000000000 830; CHECK-NEXT: [[TMP46:%.*]] = fptoui float [[TMP45]] to i32 831; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP46]] to i64 832; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP42]] to i64 833; CHECK-NEXT: [[TMP49:%.*]] = mul i64 [[TMP47]], [[TMP48]] 834; CHECK-NEXT: [[TMP50:%.*]] = trunc i64 [[TMP49]] to i32 835; CHECK-NEXT: [[TMP51:%.*]] = lshr i64 [[TMP49]], 32 836; CHECK-NEXT: [[TMP52:%.*]] = trunc i64 [[TMP51]] to i32 837; CHECK-NEXT: [[TMP53:%.*]] = sub i32 0, [[TMP50]] 838; CHECK-NEXT: [[TMP54:%.*]] = 
icmp eq i32 [[TMP52]], 0 839; CHECK-NEXT: [[TMP55:%.*]] = select i1 [[TMP54]], i32 [[TMP53]], i32 [[TMP50]] 840; CHECK-NEXT: [[TMP56:%.*]] = zext i32 [[TMP55]] to i64 841; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP46]] to i64 842; CHECK-NEXT: [[TMP58:%.*]] = mul i64 [[TMP56]], [[TMP57]] 843; CHECK-NEXT: [[TMP59:%.*]] = trunc i64 [[TMP58]] to i32 844; CHECK-NEXT: [[TMP60:%.*]] = lshr i64 [[TMP58]], 32 845; CHECK-NEXT: [[TMP61:%.*]] = trunc i64 [[TMP60]] to i32 846; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP46]], [[TMP61]] 847; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP46]], [[TMP61]] 848; CHECK-NEXT: [[TMP64:%.*]] = select i1 [[TMP54]], i32 [[TMP62]], i32 [[TMP63]] 849; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP64]] to i64 850; CHECK-NEXT: [[TMP66:%.*]] = zext i32 [[TMP41]] to i64 851; CHECK-NEXT: [[TMP67:%.*]] = mul i64 [[TMP65]], [[TMP66]] 852; CHECK-NEXT: [[TMP68:%.*]] = trunc i64 [[TMP67]] to i32 853; CHECK-NEXT: [[TMP69:%.*]] = lshr i64 [[TMP67]], 32 854; CHECK-NEXT: [[TMP70:%.*]] = trunc i64 [[TMP69]] to i32 855; CHECK-NEXT: [[TMP71:%.*]] = mul i32 [[TMP70]], [[TMP42]] 856; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP41]], [[TMP71]] 857; CHECK-NEXT: [[TMP73:%.*]] = icmp uge i32 [[TMP72]], [[TMP42]] 858; CHECK-NEXT: [[TMP74:%.*]] = icmp uge i32 [[TMP41]], [[TMP71]] 859; CHECK-NEXT: [[TMP75:%.*]] = and i1 [[TMP73]], [[TMP74]] 860; CHECK-NEXT: [[TMP76:%.*]] = add i32 [[TMP70]], 1 861; CHECK-NEXT: [[TMP77:%.*]] = sub i32 [[TMP70]], 1 862; CHECK-NEXT: [[TMP78:%.*]] = select i1 [[TMP75]], i32 [[TMP76]], i32 [[TMP70]] 863; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP74]], i32 [[TMP78]], i32 [[TMP77]] 864; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP79]], i64 1 865; CHECK-NEXT: [[TMP81:%.*]] = extractelement <4 x i32> [[X]], i64 2 866; CHECK-NEXT: [[TMP82:%.*]] = extractelement <4 x i32> [[Y]], i64 2 867; CHECK-NEXT: [[TMP83:%.*]] = uitofp i32 [[TMP82]] to float 868; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float 
[[TMP83]]) 869; CHECK-NEXT: [[TMP85:%.*]] = fmul fast float [[TMP84]], 0x41F0000000000000 870; CHECK-NEXT: [[TMP86:%.*]] = fptoui float [[TMP85]] to i32 871; CHECK-NEXT: [[TMP87:%.*]] = zext i32 [[TMP86]] to i64 872; CHECK-NEXT: [[TMP88:%.*]] = zext i32 [[TMP82]] to i64 873; CHECK-NEXT: [[TMP89:%.*]] = mul i64 [[TMP87]], [[TMP88]] 874; CHECK-NEXT: [[TMP90:%.*]] = trunc i64 [[TMP89]] to i32 875; CHECK-NEXT: [[TMP91:%.*]] = lshr i64 [[TMP89]], 32 876; CHECK-NEXT: [[TMP92:%.*]] = trunc i64 [[TMP91]] to i32 877; CHECK-NEXT: [[TMP93:%.*]] = sub i32 0, [[TMP90]] 878; CHECK-NEXT: [[TMP94:%.*]] = icmp eq i32 [[TMP92]], 0 879; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP93]], i32 [[TMP90]] 880; CHECK-NEXT: [[TMP96:%.*]] = zext i32 [[TMP95]] to i64 881; CHECK-NEXT: [[TMP97:%.*]] = zext i32 [[TMP86]] to i64 882; CHECK-NEXT: [[TMP98:%.*]] = mul i64 [[TMP96]], [[TMP97]] 883; CHECK-NEXT: [[TMP99:%.*]] = trunc i64 [[TMP98]] to i32 884; CHECK-NEXT: [[TMP100:%.*]] = lshr i64 [[TMP98]], 32 885; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 886; CHECK-NEXT: [[TMP102:%.*]] = add i32 [[TMP86]], [[TMP101]] 887; CHECK-NEXT: [[TMP103:%.*]] = sub i32 [[TMP86]], [[TMP101]] 888; CHECK-NEXT: [[TMP104:%.*]] = select i1 [[TMP94]], i32 [[TMP102]], i32 [[TMP103]] 889; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP104]] to i64 890; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP81]] to i64 891; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 892; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 893; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 894; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 895; CHECK-NEXT: [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP82]] 896; CHECK-NEXT: [[TMP112:%.*]] = sub i32 [[TMP81]], [[TMP111]] 897; CHECK-NEXT: [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP82]] 898; CHECK-NEXT: [[TMP114:%.*]] = icmp uge i32 [[TMP81]], [[TMP111]] 899; CHECK-NEXT: [[TMP115:%.*]] = and i1 [[TMP113]], [[TMP114]] 900; CHECK-NEXT: 
[[TMP116:%.*]] = add i32 [[TMP110]], 1 901; CHECK-NEXT: [[TMP117:%.*]] = sub i32 [[TMP110]], 1 902; CHECK-NEXT: [[TMP118:%.*]] = select i1 [[TMP115]], i32 [[TMP116]], i32 [[TMP110]] 903; CHECK-NEXT: [[TMP119:%.*]] = select i1 [[TMP114]], i32 [[TMP118]], i32 [[TMP117]] 904; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> [[TMP80]], i32 [[TMP119]], i64 2 905; CHECK-NEXT: [[TMP121:%.*]] = extractelement <4 x i32> [[X]], i64 3 906; CHECK-NEXT: [[TMP122:%.*]] = extractelement <4 x i32> [[Y]], i64 3 907; CHECK-NEXT: [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float 908; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]]) 909; CHECK-NEXT: [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41F0000000000000 910; CHECK-NEXT: [[TMP126:%.*]] = fptoui float [[TMP125]] to i32 911; CHECK-NEXT: [[TMP127:%.*]] = zext i32 [[TMP126]] to i64 912; CHECK-NEXT: [[TMP128:%.*]] = zext i32 [[TMP122]] to i64 913; CHECK-NEXT: [[TMP129:%.*]] = mul i64 [[TMP127]], [[TMP128]] 914; CHECK-NEXT: [[TMP130:%.*]] = trunc i64 [[TMP129]] to i32 915; CHECK-NEXT: [[TMP131:%.*]] = lshr i64 [[TMP129]], 32 916; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32 917; CHECK-NEXT: [[TMP133:%.*]] = sub i32 0, [[TMP130]] 918; CHECK-NEXT: [[TMP134:%.*]] = icmp eq i32 [[TMP132]], 0 919; CHECK-NEXT: [[TMP135:%.*]] = select i1 [[TMP134]], i32 [[TMP133]], i32 [[TMP130]] 920; CHECK-NEXT: [[TMP136:%.*]] = zext i32 [[TMP135]] to i64 921; CHECK-NEXT: [[TMP137:%.*]] = zext i32 [[TMP126]] to i64 922; CHECK-NEXT: [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]] 923; CHECK-NEXT: [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32 924; CHECK-NEXT: [[TMP140:%.*]] = lshr i64 [[TMP138]], 32 925; CHECK-NEXT: [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32 926; CHECK-NEXT: [[TMP142:%.*]] = add i32 [[TMP126]], [[TMP141]] 927; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP126]], [[TMP141]] 928; CHECK-NEXT: [[TMP144:%.*]] = select i1 [[TMP134]], i32 [[TMP142]], i32 [[TMP143]] 929; CHECK-NEXT: [[TMP145:%.*]] = 
zext i32 [[TMP144]] to i64 930; CHECK-NEXT: [[TMP146:%.*]] = zext i32 [[TMP121]] to i64 931; CHECK-NEXT: [[TMP147:%.*]] = mul i64 [[TMP145]], [[TMP146]] 932; CHECK-NEXT: [[TMP148:%.*]] = trunc i64 [[TMP147]] to i32 933; CHECK-NEXT: [[TMP149:%.*]] = lshr i64 [[TMP147]], 32 934; CHECK-NEXT: [[TMP150:%.*]] = trunc i64 [[TMP149]] to i32 935; CHECK-NEXT: [[TMP151:%.*]] = mul i32 [[TMP150]], [[TMP122]] 936; CHECK-NEXT: [[TMP152:%.*]] = sub i32 [[TMP121]], [[TMP151]] 937; CHECK-NEXT: [[TMP153:%.*]] = icmp uge i32 [[TMP152]], [[TMP122]] 938; CHECK-NEXT: [[TMP154:%.*]] = icmp uge i32 [[TMP121]], [[TMP151]] 939; CHECK-NEXT: [[TMP155:%.*]] = and i1 [[TMP153]], [[TMP154]] 940; CHECK-NEXT: [[TMP156:%.*]] = add i32 [[TMP150]], 1 941; CHECK-NEXT: [[TMP157:%.*]] = sub i32 [[TMP150]], 1 942; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP155]], i32 [[TMP156]], i32 [[TMP150]] 943; CHECK-NEXT: [[TMP159:%.*]] = select i1 [[TMP154]], i32 [[TMP158]], i32 [[TMP157]] 944; CHECK-NEXT: [[TMP160:%.*]] = insertelement <4 x i32> [[TMP120]], i32 [[TMP159]], i64 3 945; CHECK-NEXT: store <4 x i32> [[TMP160]], <4 x i32> addrspace(1)* [[OUT:%.*]] 946; CHECK-NEXT: ret void 947; 948; GCN-LABEL: udiv_v4i32: 949; GCN: ; %bb.0: 950; GCN-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd 951; GCN-NEXT: s_mov_b32 s6, 0x4f800000 952; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 953; GCN-NEXT: s_mov_b32 s19, 0xf000 954; GCN-NEXT: s_mov_b32 s18, -1 955; GCN-NEXT: s_waitcnt lgkmcnt(0) 956; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 957; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 958; GCN-NEXT: v_cvt_f32_u32_e32 v7, s15 959; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 960; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 961; GCN-NEXT: v_mul_f32_e32 v0, s6, v0 962; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 963; GCN-NEXT: v_mul_f32_e32 v1, s6, v1 964; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 965; GCN-NEXT: v_mul_hi_u32 v2, v0, s12 966; GCN-NEXT: v_mul_lo_u32 v3, v0, s12 967; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 968; GCN-NEXT: v_mul_lo_u32 v5, v1, s13 969; GCN-NEXT: 
v_cmp_eq_u32_e64 s[0:1], 0, v2 970; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 971; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v6, s[0:1] 972; GCN-NEXT: v_mul_hi_u32 v2, v2, v0 973; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v5 974; GCN-NEXT: v_add_i32_e32 v6, vcc, v2, v0 975; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 976; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] 977; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 978; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 979; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[0:1] 980; GCN-NEXT: v_mul_hi_u32 v2, v2, v1 981; GCN-NEXT: v_mul_lo_u32 v3, v0, s12 982; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v0 983; GCN-NEXT: v_sub_i32_e32 v5, vcc, s8, v3 984; GCN-NEXT: v_cmp_le_u32_e64 s[4:5], s12, v5 985; GCN-NEXT: v_add_i32_e32 v5, vcc, v2, v1 986; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v2, v1 987; GCN-NEXT: v_cvt_f32_u32_e32 v2, s14 988; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] 989; GCN-NEXT: v_mul_hi_u32 v1, v1, s9 990; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s8, v3 991; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 992; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 993; GCN-NEXT: s_and_b64 vcc, s[4:5], s[2:3] 994; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 995; GCN-NEXT: v_mul_f32_e32 v2, s6, v2 996; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 997; GCN-NEXT: v_mul_lo_u32 v3, v1, s13 998; GCN-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[2:3] 999; GCN-NEXT: v_mul_hi_u32 v6, v2, s14 1000; GCN-NEXT: v_mul_lo_u32 v5, v2, s14 1001; GCN-NEXT: v_sub_i32_e32 v4, vcc, s9, v3 1002; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s9, v3 1003; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 1004; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v5 1005; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] 1006; GCN-NEXT: v_mul_hi_u32 v3, v3, v2 1007; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 1008; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v1 1009; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v1 1010; GCN-NEXT: v_add_i32_e32 v6, vcc, v3, v2 1011; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v3, v2 1012; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v7 1013; GCN-NEXT: 
v_cndmask_b32_e64 v2, v2, v6, s[4:5] 1014; GCN-NEXT: v_mul_hi_u32 v2, v2, s10 1015; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1016; GCN-NEXT: v_mul_f32_e32 v3, s6, v3 1017; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 1018; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1019; GCN-NEXT: v_mul_lo_u32 v5, v2, s14 1020; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[2:3] 1021; GCN-NEXT: v_mul_hi_u32 v7, v3, s15 1022; GCN-NEXT: v_mul_lo_u32 v6, v3, s15 1023; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v5 1024; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v4 1025; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v7 1026; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 1027; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[2:3] 1028; GCN-NEXT: v_mul_hi_u32 v4, v4, v3 1029; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v2 1030; GCN-NEXT: v_add_i32_e32 v7, vcc, v4, v3 1031; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v4, v3 1032; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] 1033; GCN-NEXT: v_mul_hi_u32 v3, v3, s11 1034; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s10, v5 1035; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v2 1036; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1037; GCN-NEXT: v_mul_lo_u32 v5, v3, s15 1038; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1039; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] 1040; GCN-NEXT: v_sub_i32_e32 v4, vcc, s11, v5 1041; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v4 1042; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s11, v5 1043; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v3 1044; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 1045; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1046; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 1047; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[2:3] 1048; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 1049; GCN-NEXT: s_endpgm 1050 %r = udiv <4 x i32> %x, %y 1051 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1052 ret void 1053} 1054 1055define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1056; CHECK-LABEL: @urem_v4i32( 1057; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 
x i32> [[X:%.*]], i64 0 1058; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1059; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1060; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1061; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41F0000000000000 1062; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1063; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 1064; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP2]] to i64 1065; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 1066; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 1067; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 1068; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1069; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP10]] 1070; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0 1071; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP10]] 1072; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 1073; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP6]] to i64 1074; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1075; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1076; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1077; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1078; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP6]], [[TMP21]] 1079; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP6]], [[TMP21]] 1080; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP14]], i32 [[TMP22]], i32 [[TMP23]] 1081; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64 1082; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP1]] to i64 1083; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP25]], [[TMP26]] 1084; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 1085; CHECK-NEXT: [[TMP29:%.*]] = lshr i64 [[TMP27]], 32 1086; CHECK-NEXT: [[TMP30:%.*]] = trunc i64 [[TMP29]] to i32 1087; CHECK-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], [[TMP2]] 1088; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP1]], [[TMP31]] 1089; 
CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP2]] 1090; CHECK-NEXT: [[TMP34:%.*]] = icmp uge i32 [[TMP1]], [[TMP31]] 1091; CHECK-NEXT: [[TMP35:%.*]] = and i1 [[TMP33]], [[TMP34]] 1092; CHECK-NEXT: [[TMP36:%.*]] = sub i32 [[TMP32]], [[TMP2]] 1093; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP2]] 1094; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP35]], i32 [[TMP36]], i32 [[TMP32]] 1095; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP34]], i32 [[TMP38]], i32 [[TMP37]] 1096; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i32> undef, i32 [[TMP39]], i64 0 1097; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i32> [[X]], i64 1 1098; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1099; CHECK-NEXT: [[TMP43:%.*]] = uitofp i32 [[TMP42]] to float 1100; CHECK-NEXT: [[TMP44:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP43]]) 1101; CHECK-NEXT: [[TMP45:%.*]] = fmul fast float [[TMP44]], 0x41F0000000000000 1102; CHECK-NEXT: [[TMP46:%.*]] = fptoui float [[TMP45]] to i32 1103; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP46]] to i64 1104; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP42]] to i64 1105; CHECK-NEXT: [[TMP49:%.*]] = mul i64 [[TMP47]], [[TMP48]] 1106; CHECK-NEXT: [[TMP50:%.*]] = trunc i64 [[TMP49]] to i32 1107; CHECK-NEXT: [[TMP51:%.*]] = lshr i64 [[TMP49]], 32 1108; CHECK-NEXT: [[TMP52:%.*]] = trunc i64 [[TMP51]] to i32 1109; CHECK-NEXT: [[TMP53:%.*]] = sub i32 0, [[TMP50]] 1110; CHECK-NEXT: [[TMP54:%.*]] = icmp eq i32 [[TMP52]], 0 1111; CHECK-NEXT: [[TMP55:%.*]] = select i1 [[TMP54]], i32 [[TMP53]], i32 [[TMP50]] 1112; CHECK-NEXT: [[TMP56:%.*]] = zext i32 [[TMP55]] to i64 1113; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP46]] to i64 1114; CHECK-NEXT: [[TMP58:%.*]] = mul i64 [[TMP56]], [[TMP57]] 1115; CHECK-NEXT: [[TMP59:%.*]] = trunc i64 [[TMP58]] to i32 1116; CHECK-NEXT: [[TMP60:%.*]] = lshr i64 [[TMP58]], 32 1117; CHECK-NEXT: [[TMP61:%.*]] = trunc i64 [[TMP60]] to i32 1118; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP46]], [[TMP61]] 1119; 
CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP46]], [[TMP61]] 1120; CHECK-NEXT: [[TMP64:%.*]] = select i1 [[TMP54]], i32 [[TMP62]], i32 [[TMP63]] 1121; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP64]] to i64 1122; CHECK-NEXT: [[TMP66:%.*]] = zext i32 [[TMP41]] to i64 1123; CHECK-NEXT: [[TMP67:%.*]] = mul i64 [[TMP65]], [[TMP66]] 1124; CHECK-NEXT: [[TMP68:%.*]] = trunc i64 [[TMP67]] to i32 1125; CHECK-NEXT: [[TMP69:%.*]] = lshr i64 [[TMP67]], 32 1126; CHECK-NEXT: [[TMP70:%.*]] = trunc i64 [[TMP69]] to i32 1127; CHECK-NEXT: [[TMP71:%.*]] = mul i32 [[TMP70]], [[TMP42]] 1128; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP41]], [[TMP71]] 1129; CHECK-NEXT: [[TMP73:%.*]] = icmp uge i32 [[TMP72]], [[TMP42]] 1130; CHECK-NEXT: [[TMP74:%.*]] = icmp uge i32 [[TMP41]], [[TMP71]] 1131; CHECK-NEXT: [[TMP75:%.*]] = and i1 [[TMP73]], [[TMP74]] 1132; CHECK-NEXT: [[TMP76:%.*]] = sub i32 [[TMP72]], [[TMP42]] 1133; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP42]] 1134; CHECK-NEXT: [[TMP78:%.*]] = select i1 [[TMP75]], i32 [[TMP76]], i32 [[TMP72]] 1135; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP74]], i32 [[TMP78]], i32 [[TMP77]] 1136; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP79]], i64 1 1137; CHECK-NEXT: [[TMP81:%.*]] = extractelement <4 x i32> [[X]], i64 2 1138; CHECK-NEXT: [[TMP82:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1139; CHECK-NEXT: [[TMP83:%.*]] = uitofp i32 [[TMP82]] to float 1140; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP83]]) 1141; CHECK-NEXT: [[TMP85:%.*]] = fmul fast float [[TMP84]], 0x41F0000000000000 1142; CHECK-NEXT: [[TMP86:%.*]] = fptoui float [[TMP85]] to i32 1143; CHECK-NEXT: [[TMP87:%.*]] = zext i32 [[TMP86]] to i64 1144; CHECK-NEXT: [[TMP88:%.*]] = zext i32 [[TMP82]] to i64 1145; CHECK-NEXT: [[TMP89:%.*]] = mul i64 [[TMP87]], [[TMP88]] 1146; CHECK-NEXT: [[TMP90:%.*]] = trunc i64 [[TMP89]] to i32 1147; CHECK-NEXT: [[TMP91:%.*]] = lshr i64 [[TMP89]], 32 1148; CHECK-NEXT: [[TMP92:%.*]] = trunc i64 
[[TMP91]] to i32 1149; CHECK-NEXT: [[TMP93:%.*]] = sub i32 0, [[TMP90]] 1150; CHECK-NEXT: [[TMP94:%.*]] = icmp eq i32 [[TMP92]], 0 1151; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP93]], i32 [[TMP90]] 1152; CHECK-NEXT: [[TMP96:%.*]] = zext i32 [[TMP95]] to i64 1153; CHECK-NEXT: [[TMP97:%.*]] = zext i32 [[TMP86]] to i64 1154; CHECK-NEXT: [[TMP98:%.*]] = mul i64 [[TMP96]], [[TMP97]] 1155; CHECK-NEXT: [[TMP99:%.*]] = trunc i64 [[TMP98]] to i32 1156; CHECK-NEXT: [[TMP100:%.*]] = lshr i64 [[TMP98]], 32 1157; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 1158; CHECK-NEXT: [[TMP102:%.*]] = add i32 [[TMP86]], [[TMP101]] 1159; CHECK-NEXT: [[TMP103:%.*]] = sub i32 [[TMP86]], [[TMP101]] 1160; CHECK-NEXT: [[TMP104:%.*]] = select i1 [[TMP94]], i32 [[TMP102]], i32 [[TMP103]] 1161; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP104]] to i64 1162; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP81]] to i64 1163; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1164; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1165; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1166; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1167; CHECK-NEXT: [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP82]] 1168; CHECK-NEXT: [[TMP112:%.*]] = sub i32 [[TMP81]], [[TMP111]] 1169; CHECK-NEXT: [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP82]] 1170; CHECK-NEXT: [[TMP114:%.*]] = icmp uge i32 [[TMP81]], [[TMP111]] 1171; CHECK-NEXT: [[TMP115:%.*]] = and i1 [[TMP113]], [[TMP114]] 1172; CHECK-NEXT: [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP82]] 1173; CHECK-NEXT: [[TMP117:%.*]] = add i32 [[TMP112]], [[TMP82]] 1174; CHECK-NEXT: [[TMP118:%.*]] = select i1 [[TMP115]], i32 [[TMP116]], i32 [[TMP112]] 1175; CHECK-NEXT: [[TMP119:%.*]] = select i1 [[TMP114]], i32 [[TMP118]], i32 [[TMP117]] 1176; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> [[TMP80]], i32 [[TMP119]], i64 2 1177; CHECK-NEXT: [[TMP121:%.*]] = extractelement <4 x i32> [[X]], i64 3 1178; CHECK-NEXT: 
[[TMP122:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1179; CHECK-NEXT: [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float 1180; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]]) 1181; CHECK-NEXT: [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41F0000000000000 1182; CHECK-NEXT: [[TMP126:%.*]] = fptoui float [[TMP125]] to i32 1183; CHECK-NEXT: [[TMP127:%.*]] = zext i32 [[TMP126]] to i64 1184; CHECK-NEXT: [[TMP128:%.*]] = zext i32 [[TMP122]] to i64 1185; CHECK-NEXT: [[TMP129:%.*]] = mul i64 [[TMP127]], [[TMP128]] 1186; CHECK-NEXT: [[TMP130:%.*]] = trunc i64 [[TMP129]] to i32 1187; CHECK-NEXT: [[TMP131:%.*]] = lshr i64 [[TMP129]], 32 1188; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32 1189; CHECK-NEXT: [[TMP133:%.*]] = sub i32 0, [[TMP130]] 1190; CHECK-NEXT: [[TMP134:%.*]] = icmp eq i32 [[TMP132]], 0 1191; CHECK-NEXT: [[TMP135:%.*]] = select i1 [[TMP134]], i32 [[TMP133]], i32 [[TMP130]] 1192; CHECK-NEXT: [[TMP136:%.*]] = zext i32 [[TMP135]] to i64 1193; CHECK-NEXT: [[TMP137:%.*]] = zext i32 [[TMP126]] to i64 1194; CHECK-NEXT: [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]] 1195; CHECK-NEXT: [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32 1196; CHECK-NEXT: [[TMP140:%.*]] = lshr i64 [[TMP138]], 32 1197; CHECK-NEXT: [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32 1198; CHECK-NEXT: [[TMP142:%.*]] = add i32 [[TMP126]], [[TMP141]] 1199; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP126]], [[TMP141]] 1200; CHECK-NEXT: [[TMP144:%.*]] = select i1 [[TMP134]], i32 [[TMP142]], i32 [[TMP143]] 1201; CHECK-NEXT: [[TMP145:%.*]] = zext i32 [[TMP144]] to i64 1202; CHECK-NEXT: [[TMP146:%.*]] = zext i32 [[TMP121]] to i64 1203; CHECK-NEXT: [[TMP147:%.*]] = mul i64 [[TMP145]], [[TMP146]] 1204; CHECK-NEXT: [[TMP148:%.*]] = trunc i64 [[TMP147]] to i32 1205; CHECK-NEXT: [[TMP149:%.*]] = lshr i64 [[TMP147]], 32 1206; CHECK-NEXT: [[TMP150:%.*]] = trunc i64 [[TMP149]] to i32 1207; CHECK-NEXT: [[TMP151:%.*]] = mul i32 [[TMP150]], [[TMP122]] 1208; CHECK-NEXT: 
[[TMP152:%.*]] = sub i32 [[TMP121]], [[TMP151]] 1209; CHECK-NEXT: [[TMP153:%.*]] = icmp uge i32 [[TMP152]], [[TMP122]] 1210; CHECK-NEXT: [[TMP154:%.*]] = icmp uge i32 [[TMP121]], [[TMP151]] 1211; CHECK-NEXT: [[TMP155:%.*]] = and i1 [[TMP153]], [[TMP154]] 1212; CHECK-NEXT: [[TMP156:%.*]] = sub i32 [[TMP152]], [[TMP122]] 1213; CHECK-NEXT: [[TMP157:%.*]] = add i32 [[TMP152]], [[TMP122]] 1214; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP155]], i32 [[TMP156]], i32 [[TMP152]] 1215; CHECK-NEXT: [[TMP159:%.*]] = select i1 [[TMP154]], i32 [[TMP158]], i32 [[TMP157]] 1216; CHECK-NEXT: [[TMP160:%.*]] = insertelement <4 x i32> [[TMP120]], i32 [[TMP159]], i64 3 1217; CHECK-NEXT: store <4 x i32> [[TMP160]], <4 x i32> addrspace(1)* [[OUT:%.*]] 1218; CHECK-NEXT: ret void 1219; 1220; GCN-LABEL: urem_v4i32: 1221; GCN: ; %bb.0: 1222; GCN-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd 1223; GCN-NEXT: s_mov_b32 s6, 0x4f800000 1224; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 1225; GCN-NEXT: s_mov_b32 s19, 0xf000 1226; GCN-NEXT: s_mov_b32 s18, -1 1227; GCN-NEXT: s_waitcnt lgkmcnt(0) 1228; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 1229; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 1230; GCN-NEXT: v_cvt_f32_u32_e32 v7, s15 1231; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 1232; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 1233; GCN-NEXT: v_mul_f32_e32 v0, s6, v0 1234; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 1235; GCN-NEXT: v_mul_f32_e32 v1, s6, v1 1236; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1237; GCN-NEXT: v_mul_lo_u32 v2, v0, s12 1238; GCN-NEXT: v_mul_hi_u32 v3, v0, s12 1239; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 1240; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 1241; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 1242; GCN-NEXT: v_mul_hi_u32 v2, v2, v0 1243; GCN-NEXT: v_mul_lo_u32 v3, v1, s13 1244; GCN-NEXT: v_add_i32_e32 v4, vcc, v2, v0 1245; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 1246; GCN-NEXT: v_mul_hi_u32 v2, v1, s13 1247; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 1248; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 1249; 
GCN-NEXT: v_mul_hi_u32 v0, v0, s8 1250; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 1251; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] 1252; GCN-NEXT: v_mul_hi_u32 v2, v2, v1 1253; GCN-NEXT: v_mul_lo_u32 v0, v0, s12 1254; GCN-NEXT: v_add_i32_e32 v5, vcc, v2, v1 1255; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v2, v1 1256; GCN-NEXT: v_cvt_f32_u32_e32 v2, s14 1257; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] 1258; GCN-NEXT: v_mul_hi_u32 v1, v1, s9 1259; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v0 1260; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 1261; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], s8, v0 1262; GCN-NEXT: v_mul_lo_u32 v1, v1, s13 1263; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v3 1264; GCN-NEXT: v_mul_f32_e32 v2, s6, v2 1265; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 1266; GCN-NEXT: v_add_i32_e32 v4, vcc, s12, v3 1267; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v3 1268; GCN-NEXT: s_and_b64 vcc, s[2:3], s[4:5] 1269; GCN-NEXT: v_mul_lo_u32 v5, v2, s14 1270; GCN-NEXT: v_mul_hi_u32 v6, v2, s14 1271; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc 1272; GCN-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[4:5] 1273; GCN-NEXT: v_sub_i32_e32 v3, vcc, s9, v1 1274; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s9, v1 1275; GCN-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 1276; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 1277; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] 1278; GCN-NEXT: v_mul_hi_u32 v1, v1, v2 1279; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v3 1280; GCN-NEXT: v_add_i32_e32 v4, vcc, s13, v3 1281; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s13, v3 1282; GCN-NEXT: v_add_i32_e32 v6, vcc, v1, v2 1283; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v1, v2 1284; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] 1285; GCN-NEXT: v_mul_hi_u32 v1, v1, s10 1286; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v7 1287; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1288; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 1289; GCN-NEXT: v_mul_lo_u32 v5, v1, s14 1290; GCN-NEXT: v_mul_f32_e32 v1, s6, v2 1291; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 1292; GCN-NEXT: v_cndmask_b32_e64 v1, v4, 
v3, s[2:3] 1293; GCN-NEXT: v_sub_i32_e32 v3, vcc, s10, v5 1294; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v3 1295; GCN-NEXT: v_mul_lo_u32 v4, v2, s15 1296; GCN-NEXT: v_mul_hi_u32 v6, v2, s15 1297; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v4 1298; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v6 1299; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[2:3] 1300; GCN-NEXT: v_mul_hi_u32 v4, v4, v2 1301; GCN-NEXT: v_add_i32_e32 v6, vcc, s14, v3 1302; GCN-NEXT: v_add_i32_e32 v7, vcc, v4, v2 1303; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v4, v2 1304; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[2:3] 1305; GCN-NEXT: v_mul_hi_u32 v2, v2, s11 1306; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s10, v5 1307; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s14, v3 1308; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1309; GCN-NEXT: v_mul_lo_u32 v5, v2, s15 1310; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 1311; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] 1312; GCN-NEXT: v_sub_i32_e32 v3, vcc, s11, v5 1313; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s11, v5 1314; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v3 1315; GCN-NEXT: v_add_i32_e32 v4, vcc, s15, v3 1316; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s15, v3 1317; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1318; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 1319; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[2:3] 1320; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 1321; GCN-NEXT: s_endpgm 1322 %r = urem <4 x i32> %x, %y 1323 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1324 ret void 1325} 1326 1327define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1328; CHECK-LABEL: @sdiv_v4i32( 1329; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1330; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1331; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 1332; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 1333; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 1334; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], 
[[TMP3]] 1335; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 1336; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 1337; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 1338; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 1339; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 1340; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41F0000000000000 1341; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 1342; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 1343; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP9]] to i64 1344; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 1345; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 1346; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 1347; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1348; CHECK-NEXT: [[TMP20:%.*]] = sub i32 0, [[TMP17]] 1349; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0 1350; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 [[TMP17]] 1351; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64 1352; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP13]] to i64 1353; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 1354; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 1355; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 1356; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 1357; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP13]], [[TMP28]] 1358; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP13]], [[TMP28]] 1359; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP21]], i32 [[TMP29]], i32 [[TMP30]] 1360; CHECK-NEXT: [[TMP32:%.*]] = zext i32 [[TMP31]] to i64 1361; CHECK-NEXT: [[TMP33:%.*]] = zext i32 [[TMP8]] to i64 1362; CHECK-NEXT: [[TMP34:%.*]] = mul i64 [[TMP32]], [[TMP33]] 1363; CHECK-NEXT: [[TMP35:%.*]] = trunc i64 [[TMP34]] to i32 1364; CHECK-NEXT: [[TMP36:%.*]] = lshr i64 [[TMP34]], 32 1365; CHECK-NEXT: [[TMP37:%.*]] = trunc i64 [[TMP36]] to i32 1366; CHECK-NEXT: 
[[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP9]] 1367; CHECK-NEXT: [[TMP39:%.*]] = sub i32 [[TMP8]], [[TMP38]] 1368; CHECK-NEXT: [[TMP40:%.*]] = icmp uge i32 [[TMP39]], [[TMP9]] 1369; CHECK-NEXT: [[TMP41:%.*]] = icmp uge i32 [[TMP8]], [[TMP38]] 1370; CHECK-NEXT: [[TMP42:%.*]] = and i1 [[TMP40]], [[TMP41]] 1371; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP37]], 1 1372; CHECK-NEXT: [[TMP44:%.*]] = sub i32 [[TMP37]], 1 1373; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP42]], i32 [[TMP43]], i32 [[TMP37]] 1374; CHECK-NEXT: [[TMP46:%.*]] = select i1 [[TMP41]], i32 [[TMP45]], i32 [[TMP44]] 1375; CHECK-NEXT: [[TMP47:%.*]] = xor i32 [[TMP46]], [[TMP5]] 1376; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP47]], [[TMP5]] 1377; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x i32> undef, i32 [[TMP48]], i64 0 1378; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i32> [[X]], i64 1 1379; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1380; CHECK-NEXT: [[TMP52:%.*]] = ashr i32 [[TMP50]], 31 1381; CHECK-NEXT: [[TMP53:%.*]] = ashr i32 [[TMP51]], 31 1382; CHECK-NEXT: [[TMP54:%.*]] = xor i32 [[TMP52]], [[TMP53]] 1383; CHECK-NEXT: [[TMP55:%.*]] = add i32 [[TMP50]], [[TMP52]] 1384; CHECK-NEXT: [[TMP56:%.*]] = add i32 [[TMP51]], [[TMP53]] 1385; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP52]] 1386; CHECK-NEXT: [[TMP58:%.*]] = xor i32 [[TMP56]], [[TMP53]] 1387; CHECK-NEXT: [[TMP59:%.*]] = uitofp i32 [[TMP58]] to float 1388; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP59]]) 1389; CHECK-NEXT: [[TMP61:%.*]] = fmul fast float [[TMP60]], 0x41F0000000000000 1390; CHECK-NEXT: [[TMP62:%.*]] = fptoui float [[TMP61]] to i32 1391; CHECK-NEXT: [[TMP63:%.*]] = zext i32 [[TMP62]] to i64 1392; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP58]] to i64 1393; CHECK-NEXT: [[TMP65:%.*]] = mul i64 [[TMP63]], [[TMP64]] 1394; CHECK-NEXT: [[TMP66:%.*]] = trunc i64 [[TMP65]] to i32 1395; CHECK-NEXT: [[TMP67:%.*]] = lshr i64 [[TMP65]], 32 1396; CHECK-NEXT: [[TMP68:%.*]] = trunc 
i64 [[TMP67]] to i32 1397; CHECK-NEXT: [[TMP69:%.*]] = sub i32 0, [[TMP66]] 1398; CHECK-NEXT: [[TMP70:%.*]] = icmp eq i32 [[TMP68]], 0 1399; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP69]], i32 [[TMP66]] 1400; CHECK-NEXT: [[TMP72:%.*]] = zext i32 [[TMP71]] to i64 1401; CHECK-NEXT: [[TMP73:%.*]] = zext i32 [[TMP62]] to i64 1402; CHECK-NEXT: [[TMP74:%.*]] = mul i64 [[TMP72]], [[TMP73]] 1403; CHECK-NEXT: [[TMP75:%.*]] = trunc i64 [[TMP74]] to i32 1404; CHECK-NEXT: [[TMP76:%.*]] = lshr i64 [[TMP74]], 32 1405; CHECK-NEXT: [[TMP77:%.*]] = trunc i64 [[TMP76]] to i32 1406; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP62]], [[TMP77]] 1407; CHECK-NEXT: [[TMP79:%.*]] = sub i32 [[TMP62]], [[TMP77]] 1408; CHECK-NEXT: [[TMP80:%.*]] = select i1 [[TMP70]], i32 [[TMP78]], i32 [[TMP79]] 1409; CHECK-NEXT: [[TMP81:%.*]] = zext i32 [[TMP80]] to i64 1410; CHECK-NEXT: [[TMP82:%.*]] = zext i32 [[TMP57]] to i64 1411; CHECK-NEXT: [[TMP83:%.*]] = mul i64 [[TMP81]], [[TMP82]] 1412; CHECK-NEXT: [[TMP84:%.*]] = trunc i64 [[TMP83]] to i32 1413; CHECK-NEXT: [[TMP85:%.*]] = lshr i64 [[TMP83]], 32 1414; CHECK-NEXT: [[TMP86:%.*]] = trunc i64 [[TMP85]] to i32 1415; CHECK-NEXT: [[TMP87:%.*]] = mul i32 [[TMP86]], [[TMP58]] 1416; CHECK-NEXT: [[TMP88:%.*]] = sub i32 [[TMP57]], [[TMP87]] 1417; CHECK-NEXT: [[TMP89:%.*]] = icmp uge i32 [[TMP88]], [[TMP58]] 1418; CHECK-NEXT: [[TMP90:%.*]] = icmp uge i32 [[TMP57]], [[TMP87]] 1419; CHECK-NEXT: [[TMP91:%.*]] = and i1 [[TMP89]], [[TMP90]] 1420; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP86]], 1 1421; CHECK-NEXT: [[TMP93:%.*]] = sub i32 [[TMP86]], 1 1422; CHECK-NEXT: [[TMP94:%.*]] = select i1 [[TMP91]], i32 [[TMP92]], i32 [[TMP86]] 1423; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP90]], i32 [[TMP94]], i32 [[TMP93]] 1424; CHECK-NEXT: [[TMP96:%.*]] = xor i32 [[TMP95]], [[TMP54]] 1425; CHECK-NEXT: [[TMP97:%.*]] = sub i32 [[TMP96]], [[TMP54]] 1426; CHECK-NEXT: [[TMP98:%.*]] = insertelement <4 x i32> [[TMP49]], i32 [[TMP97]], i64 1 1427; CHECK-NEXT: 
[[TMP99:%.*]] = extractelement <4 x i32> [[X]], i64 2 1428; CHECK-NEXT: [[TMP100:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1429; CHECK-NEXT: [[TMP101:%.*]] = ashr i32 [[TMP99]], 31 1430; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP100]], 31 1431; CHECK-NEXT: [[TMP103:%.*]] = xor i32 [[TMP101]], [[TMP102]] 1432; CHECK-NEXT: [[TMP104:%.*]] = add i32 [[TMP99]], [[TMP101]] 1433; CHECK-NEXT: [[TMP105:%.*]] = add i32 [[TMP100]], [[TMP102]] 1434; CHECK-NEXT: [[TMP106:%.*]] = xor i32 [[TMP104]], [[TMP101]] 1435; CHECK-NEXT: [[TMP107:%.*]] = xor i32 [[TMP105]], [[TMP102]] 1436; CHECK-NEXT: [[TMP108:%.*]] = uitofp i32 [[TMP107]] to float 1437; CHECK-NEXT: [[TMP109:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP108]]) 1438; CHECK-NEXT: [[TMP110:%.*]] = fmul fast float [[TMP109]], 0x41F0000000000000 1439; CHECK-NEXT: [[TMP111:%.*]] = fptoui float [[TMP110]] to i32 1440; CHECK-NEXT: [[TMP112:%.*]] = zext i32 [[TMP111]] to i64 1441; CHECK-NEXT: [[TMP113:%.*]] = zext i32 [[TMP107]] to i64 1442; CHECK-NEXT: [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]] 1443; CHECK-NEXT: [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32 1444; CHECK-NEXT: [[TMP116:%.*]] = lshr i64 [[TMP114]], 32 1445; CHECK-NEXT: [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32 1446; CHECK-NEXT: [[TMP118:%.*]] = sub i32 0, [[TMP115]] 1447; CHECK-NEXT: [[TMP119:%.*]] = icmp eq i32 [[TMP117]], 0 1448; CHECK-NEXT: [[TMP120:%.*]] = select i1 [[TMP119]], i32 [[TMP118]], i32 [[TMP115]] 1449; CHECK-NEXT: [[TMP121:%.*]] = zext i32 [[TMP120]] to i64 1450; CHECK-NEXT: [[TMP122:%.*]] = zext i32 [[TMP111]] to i64 1451; CHECK-NEXT: [[TMP123:%.*]] = mul i64 [[TMP121]], [[TMP122]] 1452; CHECK-NEXT: [[TMP124:%.*]] = trunc i64 [[TMP123]] to i32 1453; CHECK-NEXT: [[TMP125:%.*]] = lshr i64 [[TMP123]], 32 1454; CHECK-NEXT: [[TMP126:%.*]] = trunc i64 [[TMP125]] to i32 1455; CHECK-NEXT: [[TMP127:%.*]] = add i32 [[TMP111]], [[TMP126]] 1456; CHECK-NEXT: [[TMP128:%.*]] = sub i32 [[TMP111]], [[TMP126]] 1457; CHECK-NEXT: [[TMP129:%.*]] = 
select i1 [[TMP119]], i32 [[TMP127]], i32 [[TMP128]] 1458; CHECK-NEXT: [[TMP130:%.*]] = zext i32 [[TMP129]] to i64 1459; CHECK-NEXT: [[TMP131:%.*]] = zext i32 [[TMP106]] to i64 1460; CHECK-NEXT: [[TMP132:%.*]] = mul i64 [[TMP130]], [[TMP131]] 1461; CHECK-NEXT: [[TMP133:%.*]] = trunc i64 [[TMP132]] to i32 1462; CHECK-NEXT: [[TMP134:%.*]] = lshr i64 [[TMP132]], 32 1463; CHECK-NEXT: [[TMP135:%.*]] = trunc i64 [[TMP134]] to i32 1464; CHECK-NEXT: [[TMP136:%.*]] = mul i32 [[TMP135]], [[TMP107]] 1465; CHECK-NEXT: [[TMP137:%.*]] = sub i32 [[TMP106]], [[TMP136]] 1466; CHECK-NEXT: [[TMP138:%.*]] = icmp uge i32 [[TMP137]], [[TMP107]] 1467; CHECK-NEXT: [[TMP139:%.*]] = icmp uge i32 [[TMP106]], [[TMP136]] 1468; CHECK-NEXT: [[TMP140:%.*]] = and i1 [[TMP138]], [[TMP139]] 1469; CHECK-NEXT: [[TMP141:%.*]] = add i32 [[TMP135]], 1 1470; CHECK-NEXT: [[TMP142:%.*]] = sub i32 [[TMP135]], 1 1471; CHECK-NEXT: [[TMP143:%.*]] = select i1 [[TMP140]], i32 [[TMP141]], i32 [[TMP135]] 1472; CHECK-NEXT: [[TMP144:%.*]] = select i1 [[TMP139]], i32 [[TMP143]], i32 [[TMP142]] 1473; CHECK-NEXT: [[TMP145:%.*]] = xor i32 [[TMP144]], [[TMP103]] 1474; CHECK-NEXT: [[TMP146:%.*]] = sub i32 [[TMP145]], [[TMP103]] 1475; CHECK-NEXT: [[TMP147:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP146]], i64 2 1476; CHECK-NEXT: [[TMP148:%.*]] = extractelement <4 x i32> [[X]], i64 3 1477; CHECK-NEXT: [[TMP149:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1478; CHECK-NEXT: [[TMP150:%.*]] = ashr i32 [[TMP148]], 31 1479; CHECK-NEXT: [[TMP151:%.*]] = ashr i32 [[TMP149]], 31 1480; CHECK-NEXT: [[TMP152:%.*]] = xor i32 [[TMP150]], [[TMP151]] 1481; CHECK-NEXT: [[TMP153:%.*]] = add i32 [[TMP148]], [[TMP150]] 1482; CHECK-NEXT: [[TMP154:%.*]] = add i32 [[TMP149]], [[TMP151]] 1483; CHECK-NEXT: [[TMP155:%.*]] = xor i32 [[TMP153]], [[TMP150]] 1484; CHECK-NEXT: [[TMP156:%.*]] = xor i32 [[TMP154]], [[TMP151]] 1485; CHECK-NEXT: [[TMP157:%.*]] = uitofp i32 [[TMP156]] to float 1486; CHECK-NEXT: [[TMP158:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP157]]) 1487; CHECK-NEXT: [[TMP159:%.*]] = fmul fast float [[TMP158]], 0x41F0000000000000 1488; CHECK-NEXT: [[TMP160:%.*]] = fptoui float [[TMP159]] to i32 1489; CHECK-NEXT: [[TMP161:%.*]] = zext i32 [[TMP160]] to i64 1490; CHECK-NEXT: [[TMP162:%.*]] = zext i32 [[TMP156]] to i64 1491; CHECK-NEXT: [[TMP163:%.*]] = mul i64 [[TMP161]], [[TMP162]] 1492; CHECK-NEXT: [[TMP164:%.*]] = trunc i64 [[TMP163]] to i32 1493; CHECK-NEXT: [[TMP165:%.*]] = lshr i64 [[TMP163]], 32 1494; CHECK-NEXT: [[TMP166:%.*]] = trunc i64 [[TMP165]] to i32 1495; CHECK-NEXT: [[TMP167:%.*]] = sub i32 0, [[TMP164]] 1496; CHECK-NEXT: [[TMP168:%.*]] = icmp eq i32 [[TMP166]], 0 1497; CHECK-NEXT: [[TMP169:%.*]] = select i1 [[TMP168]], i32 [[TMP167]], i32 [[TMP164]] 1498; CHECK-NEXT: [[TMP170:%.*]] = zext i32 [[TMP169]] to i64 1499; CHECK-NEXT: [[TMP171:%.*]] = zext i32 [[TMP160]] to i64 1500; CHECK-NEXT: [[TMP172:%.*]] = mul i64 [[TMP170]], [[TMP171]] 1501; CHECK-NEXT: [[TMP173:%.*]] = trunc i64 [[TMP172]] to i32 1502; CHECK-NEXT: [[TMP174:%.*]] = lshr i64 [[TMP172]], 32 1503; CHECK-NEXT: [[TMP175:%.*]] = trunc i64 [[TMP174]] to i32 1504; CHECK-NEXT: [[TMP176:%.*]] = add i32 [[TMP160]], [[TMP175]] 1505; CHECK-NEXT: [[TMP177:%.*]] = sub i32 [[TMP160]], [[TMP175]] 1506; CHECK-NEXT: [[TMP178:%.*]] = select i1 [[TMP168]], i32 [[TMP176]], i32 [[TMP177]] 1507; CHECK-NEXT: [[TMP179:%.*]] = zext i32 [[TMP178]] to i64 1508; CHECK-NEXT: [[TMP180:%.*]] = zext i32 [[TMP155]] to i64 1509; CHECK-NEXT: [[TMP181:%.*]] = mul i64 [[TMP179]], [[TMP180]] 1510; CHECK-NEXT: [[TMP182:%.*]] = trunc i64 [[TMP181]] to i32 1511; CHECK-NEXT: [[TMP183:%.*]] = lshr i64 [[TMP181]], 32 1512; CHECK-NEXT: [[TMP184:%.*]] = trunc i64 [[TMP183]] to i32 1513; CHECK-NEXT: [[TMP185:%.*]] = mul i32 [[TMP184]], [[TMP156]] 1514; CHECK-NEXT: [[TMP186:%.*]] = sub i32 [[TMP155]], [[TMP185]] 1515; CHECK-NEXT: [[TMP187:%.*]] = icmp uge i32 [[TMP186]], [[TMP156]] 1516; CHECK-NEXT: [[TMP188:%.*]] = icmp uge i32 
[[TMP155]], [[TMP185]] 1517; CHECK-NEXT: [[TMP189:%.*]] = and i1 [[TMP187]], [[TMP188]] 1518; CHECK-NEXT: [[TMP190:%.*]] = add i32 [[TMP184]], 1 1519; CHECK-NEXT: [[TMP191:%.*]] = sub i32 [[TMP184]], 1 1520; CHECK-NEXT: [[TMP192:%.*]] = select i1 [[TMP189]], i32 [[TMP190]], i32 [[TMP184]] 1521; CHECK-NEXT: [[TMP193:%.*]] = select i1 [[TMP188]], i32 [[TMP192]], i32 [[TMP191]] 1522; CHECK-NEXT: [[TMP194:%.*]] = xor i32 [[TMP193]], [[TMP152]] 1523; CHECK-NEXT: [[TMP195:%.*]] = sub i32 [[TMP194]], [[TMP152]] 1524; CHECK-NEXT: [[TMP196:%.*]] = insertelement <4 x i32> [[TMP147]], i32 [[TMP195]], i64 3 1525; CHECK-NEXT: store <4 x i32> [[TMP196]], <4 x i32> addrspace(1)* [[OUT:%.*]] 1526; CHECK-NEXT: ret void 1527; 1528; GCN-LABEL: sdiv_v4i32: 1529; GCN: ; %bb.0: 1530; GCN-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0xd 1531; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 1532; GCN-NEXT: s_mov_b32 s11, 0xf000 1533; GCN-NEXT: s_mov_b32 s10, -1 1534; GCN-NEXT: s_waitcnt lgkmcnt(0) 1535; GCN-NEXT: s_ashr_i32 s2, s16, 31 1536; GCN-NEXT: s_add_i32 s3, s16, s2 1537; GCN-NEXT: s_xor_b32 s5, s3, s2 1538; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 1539; GCN-NEXT: s_mov_b32 s16, 0x4f800000 1540; GCN-NEXT: s_ashr_i32 s6, s17, 31 1541; GCN-NEXT: s_add_i32 s0, s17, s6 1542; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 1543; GCN-NEXT: s_xor_b32 s17, s0, s6 1544; GCN-NEXT: v_cvt_f32_u32_e32 v3, s17 1545; GCN-NEXT: s_ashr_i32 s3, s12, 31 1546; GCN-NEXT: v_mul_f32_e32 v0, s16, v0 1547; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 1548; GCN-NEXT: s_add_i32 s4, s12, s3 1549; GCN-NEXT: s_xor_b32 s4, s4, s3 1550; GCN-NEXT: s_xor_b32 s7, s3, s2 1551; GCN-NEXT: v_mul_lo_u32 v1, v0, s5 1552; GCN-NEXT: v_mul_hi_u32 v2, v0, s5 1553; GCN-NEXT: s_ashr_i32 s12, s13, 31 1554; GCN-NEXT: s_add_i32 s13, s13, s12 1555; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 1556; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 1557; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1558; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 1559; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v3 
1560; GCN-NEXT: s_xor_b32 s13, s13, s12 1561; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v0 1562; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 1563; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 1564; GCN-NEXT: v_mul_hi_u32 v0, v0, s4 1565; GCN-NEXT: v_mul_f32_e32 v1, s16, v2 1566; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1567; GCN-NEXT: v_mul_lo_u32 v2, v0, s5 1568; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 1569; GCN-NEXT: v_mul_hi_u32 v5, v1, s17 1570; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v2 1571; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v4 1572; GCN-NEXT: v_mul_lo_u32 v4, v1, s17 1573; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s4, v2 1574; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 1575; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 1576; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 1577; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] 1578; GCN-NEXT: v_mul_hi_u32 v4, v4, v1 1579; GCN-NEXT: v_add_i32_e32 v5, vcc, v4, v1 1580; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v4, v1 1581; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] 1582; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] 1583; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 1584; GCN-NEXT: s_ashr_i32 s5, s18, 31 1585; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] 1586; GCN-NEXT: s_add_i32 s0, s18, s5 1587; GCN-NEXT: s_xor_b32 s4, s12, s6 1588; GCN-NEXT: s_xor_b32 s12, s0, s5 1589; GCN-NEXT: v_cvt_f32_u32_e32 v4, s12 1590; GCN-NEXT: v_mul_hi_u32 v1, v1, s13 1591; GCN-NEXT: v_xor_b32_e32 v0, s7, v0 1592; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s7, v0 1593; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 1594; GCN-NEXT: v_mul_lo_u32 v2, v1, s17 1595; GCN-NEXT: s_ashr_i32 s6, s19, 31 1596; GCN-NEXT: v_mul_f32_e32 v4, s16, v4 1597; GCN-NEXT: v_sub_i32_e32 v3, vcc, s13, v2 1598; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 1599; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v3 1600; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s13, v2 1601; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v1 1602; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1 1603; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1604; GCN-NEXT: v_cndmask_b32_e32 v1, 
v1, v2, vcc 1605; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[2:3] 1606; GCN-NEXT: v_mul_lo_u32 v2, v4, s12 1607; GCN-NEXT: v_mul_hi_u32 v3, v4, s12 1608; GCN-NEXT: s_ashr_i32 s2, s14, 31 1609; GCN-NEXT: s_add_i32 s3, s14, s2 1610; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 1611; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 1612; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 1613; GCN-NEXT: v_mul_hi_u32 v2, v2, v4 1614; GCN-NEXT: s_xor_b32 s3, s3, s2 1615; GCN-NEXT: v_xor_b32_e32 v1, s4, v1 1616; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s4, v1 1617; GCN-NEXT: v_add_i32_e32 v3, vcc, v2, v4 1618; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v2, v4 1619; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 1620; GCN-NEXT: s_add_i32 s0, s19, s6 1621; GCN-NEXT: s_xor_b32 s14, s0, s6 1622; GCN-NEXT: v_cvt_f32_u32_e32 v4, s14 1623; GCN-NEXT: v_mul_hi_u32 v2, v2, s3 1624; GCN-NEXT: s_xor_b32 s7, s2, s5 1625; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 1626; GCN-NEXT: v_mul_lo_u32 v3, v2, s12 1627; GCN-NEXT: v_mul_f32_e32 v4, s16, v4 1628; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 1629; GCN-NEXT: v_sub_i32_e32 v5, vcc, s3, v3 1630; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 1631; GCN-NEXT: s_ashr_i32 s12, s15, 31 1632; GCN-NEXT: v_mul_lo_u32 v6, v4, s14 1633; GCN-NEXT: v_mul_hi_u32 v7, v4, s14 1634; GCN-NEXT: s_add_i32 s13, s15, s12 1635; GCN-NEXT: s_xor_b32 s13, s13, s12 1636; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 1637; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7 1638; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] 1639; GCN-NEXT: v_mul_hi_u32 v6, v6, v4 1640; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s3, v3 1641; GCN-NEXT: v_add_i32_e32 v5, vcc, -1, v2 1642; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v2 1643; GCN-NEXT: v_add_i32_e32 v7, vcc, v6, v4 1644; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v6, v4 1645; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] 1646; GCN-NEXT: v_mul_hi_u32 v4, v4, s13 1647; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1648; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 1649; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, 
s[2:3] 1650; GCN-NEXT: v_mul_lo_u32 v3, v4, s14 1651; GCN-NEXT: v_xor_b32_e32 v2, s7, v2 1652; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s7, v2 1653; GCN-NEXT: s_xor_b32 s4, s12, s6 1654; GCN-NEXT: v_sub_i32_e32 v5, vcc, s13, v3 1655; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v5 1656; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s13, v3 1657; GCN-NEXT: v_add_i32_e32 v5, vcc, -1, v4 1658; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v4 1659; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1660; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 1661; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[2:3] 1662; GCN-NEXT: v_xor_b32_e32 v3, s4, v3 1663; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3 1664; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 1665; GCN-NEXT: s_endpgm 1666 %r = sdiv <4 x i32> %x, %y 1667 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1668 ret void 1669} 1670 1671define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1672; CHECK-LABEL: @srem_v4i32( 1673; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1674; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1675; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 1676; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 1677; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 1678; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 1679; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 1680; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 1681; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 1682; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 1683; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41F0000000000000 1684; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 1685; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64 1686; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP8]] to i64 1687; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]] 1688; CHECK-NEXT: [[TMP16:%.*]] = trunc 
i64 [[TMP15]] to i32 1689; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP15]], 32 1690; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 1691; CHECK-NEXT: [[TMP19:%.*]] = sub i32 0, [[TMP16]] 1692; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], 0 1693; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP19]], i32 [[TMP16]] 1694; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 1695; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64 1696; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 1697; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 1698; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 1699; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 1700; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP12]], [[TMP27]] 1701; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP12]], [[TMP27]] 1702; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP20]], i32 [[TMP28]], i32 [[TMP29]] 1703; CHECK-NEXT: [[TMP31:%.*]] = zext i32 [[TMP30]] to i64 1704; CHECK-NEXT: [[TMP32:%.*]] = zext i32 [[TMP7]] to i64 1705; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP31]], [[TMP32]] 1706; CHECK-NEXT: [[TMP34:%.*]] = trunc i64 [[TMP33]] to i32 1707; CHECK-NEXT: [[TMP35:%.*]] = lshr i64 [[TMP33]], 32 1708; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 1709; CHECK-NEXT: [[TMP37:%.*]] = mul i32 [[TMP36]], [[TMP8]] 1710; CHECK-NEXT: [[TMP38:%.*]] = sub i32 [[TMP7]], [[TMP37]] 1711; CHECK-NEXT: [[TMP39:%.*]] = icmp uge i32 [[TMP38]], [[TMP8]] 1712; CHECK-NEXT: [[TMP40:%.*]] = icmp uge i32 [[TMP7]], [[TMP37]] 1713; CHECK-NEXT: [[TMP41:%.*]] = and i1 [[TMP39]], [[TMP40]] 1714; CHECK-NEXT: [[TMP42:%.*]] = sub i32 [[TMP38]], [[TMP8]] 1715; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP38]], [[TMP8]] 1716; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP41]], i32 [[TMP42]], i32 [[TMP38]] 1717; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP40]], i32 [[TMP44]], i32 [[TMP43]] 1718; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP45]], [[TMP3]] 1719; CHECK-NEXT: [[TMP47:%.*]] = sub i32 [[TMP46]], 
[[TMP3]] 1720; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i32> undef, i32 [[TMP47]], i64 0 1721; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[X]], i64 1 1722; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1723; CHECK-NEXT: [[TMP51:%.*]] = ashr i32 [[TMP49]], 31 1724; CHECK-NEXT: [[TMP52:%.*]] = ashr i32 [[TMP50]], 31 1725; CHECK-NEXT: [[TMP53:%.*]] = add i32 [[TMP49]], [[TMP51]] 1726; CHECK-NEXT: [[TMP54:%.*]] = add i32 [[TMP50]], [[TMP52]] 1727; CHECK-NEXT: [[TMP55:%.*]] = xor i32 [[TMP53]], [[TMP51]] 1728; CHECK-NEXT: [[TMP56:%.*]] = xor i32 [[TMP54]], [[TMP52]] 1729; CHECK-NEXT: [[TMP57:%.*]] = uitofp i32 [[TMP56]] to float 1730; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 1731; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP58]], 0x41F0000000000000 1732; CHECK-NEXT: [[TMP60:%.*]] = fptoui float [[TMP59]] to i32 1733; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP60]] to i64 1734; CHECK-NEXT: [[TMP62:%.*]] = zext i32 [[TMP56]] to i64 1735; CHECK-NEXT: [[TMP63:%.*]] = mul i64 [[TMP61]], [[TMP62]] 1736; CHECK-NEXT: [[TMP64:%.*]] = trunc i64 [[TMP63]] to i32 1737; CHECK-NEXT: [[TMP65:%.*]] = lshr i64 [[TMP63]], 32 1738; CHECK-NEXT: [[TMP66:%.*]] = trunc i64 [[TMP65]] to i32 1739; CHECK-NEXT: [[TMP67:%.*]] = sub i32 0, [[TMP64]] 1740; CHECK-NEXT: [[TMP68:%.*]] = icmp eq i32 [[TMP66]], 0 1741; CHECK-NEXT: [[TMP69:%.*]] = select i1 [[TMP68]], i32 [[TMP67]], i32 [[TMP64]] 1742; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i64 1743; CHECK-NEXT: [[TMP71:%.*]] = zext i32 [[TMP60]] to i64 1744; CHECK-NEXT: [[TMP72:%.*]] = mul i64 [[TMP70]], [[TMP71]] 1745; CHECK-NEXT: [[TMP73:%.*]] = trunc i64 [[TMP72]] to i32 1746; CHECK-NEXT: [[TMP74:%.*]] = lshr i64 [[TMP72]], 32 1747; CHECK-NEXT: [[TMP75:%.*]] = trunc i64 [[TMP74]] to i32 1748; CHECK-NEXT: [[TMP76:%.*]] = add i32 [[TMP60]], [[TMP75]] 1749; CHECK-NEXT: [[TMP77:%.*]] = sub i32 [[TMP60]], [[TMP75]] 1750; CHECK-NEXT: [[TMP78:%.*]] = select i1 
[[TMP68]], i32 [[TMP76]], i32 [[TMP77]] 1751; CHECK-NEXT: [[TMP79:%.*]] = zext i32 [[TMP78]] to i64 1752; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP55]] to i64 1753; CHECK-NEXT: [[TMP81:%.*]] = mul i64 [[TMP79]], [[TMP80]] 1754; CHECK-NEXT: [[TMP82:%.*]] = trunc i64 [[TMP81]] to i32 1755; CHECK-NEXT: [[TMP83:%.*]] = lshr i64 [[TMP81]], 32 1756; CHECK-NEXT: [[TMP84:%.*]] = trunc i64 [[TMP83]] to i32 1757; CHECK-NEXT: [[TMP85:%.*]] = mul i32 [[TMP84]], [[TMP56]] 1758; CHECK-NEXT: [[TMP86:%.*]] = sub i32 [[TMP55]], [[TMP85]] 1759; CHECK-NEXT: [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP56]] 1760; CHECK-NEXT: [[TMP88:%.*]] = icmp uge i32 [[TMP55]], [[TMP85]] 1761; CHECK-NEXT: [[TMP89:%.*]] = and i1 [[TMP87]], [[TMP88]] 1762; CHECK-NEXT: [[TMP90:%.*]] = sub i32 [[TMP86]], [[TMP56]] 1763; CHECK-NEXT: [[TMP91:%.*]] = add i32 [[TMP86]], [[TMP56]] 1764; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP89]], i32 [[TMP90]], i32 [[TMP86]] 1765; CHECK-NEXT: [[TMP93:%.*]] = select i1 [[TMP88]], i32 [[TMP92]], i32 [[TMP91]] 1766; CHECK-NEXT: [[TMP94:%.*]] = xor i32 [[TMP93]], [[TMP51]] 1767; CHECK-NEXT: [[TMP95:%.*]] = sub i32 [[TMP94]], [[TMP51]] 1768; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> [[TMP48]], i32 [[TMP95]], i64 1 1769; CHECK-NEXT: [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 2 1770; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1771; CHECK-NEXT: [[TMP99:%.*]] = ashr i32 [[TMP97]], 31 1772; CHECK-NEXT: [[TMP100:%.*]] = ashr i32 [[TMP98]], 31 1773; CHECK-NEXT: [[TMP101:%.*]] = add i32 [[TMP97]], [[TMP99]] 1774; CHECK-NEXT: [[TMP102:%.*]] = add i32 [[TMP98]], [[TMP100]] 1775; CHECK-NEXT: [[TMP103:%.*]] = xor i32 [[TMP101]], [[TMP99]] 1776; CHECK-NEXT: [[TMP104:%.*]] = xor i32 [[TMP102]], [[TMP100]] 1777; CHECK-NEXT: [[TMP105:%.*]] = uitofp i32 [[TMP104]] to float 1778; CHECK-NEXT: [[TMP106:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP105]]) 1779; CHECK-NEXT: [[TMP107:%.*]] = fmul fast float [[TMP106]], 0x41F0000000000000 1780; 
CHECK-NEXT: [[TMP108:%.*]] = fptoui float [[TMP107]] to i32 1781; CHECK-NEXT: [[TMP109:%.*]] = zext i32 [[TMP108]] to i64 1782; CHECK-NEXT: [[TMP110:%.*]] = zext i32 [[TMP104]] to i64 1783; CHECK-NEXT: [[TMP111:%.*]] = mul i64 [[TMP109]], [[TMP110]] 1784; CHECK-NEXT: [[TMP112:%.*]] = trunc i64 [[TMP111]] to i32 1785; CHECK-NEXT: [[TMP113:%.*]] = lshr i64 [[TMP111]], 32 1786; CHECK-NEXT: [[TMP114:%.*]] = trunc i64 [[TMP113]] to i32 1787; CHECK-NEXT: [[TMP115:%.*]] = sub i32 0, [[TMP112]] 1788; CHECK-NEXT: [[TMP116:%.*]] = icmp eq i32 [[TMP114]], 0 1789; CHECK-NEXT: [[TMP117:%.*]] = select i1 [[TMP116]], i32 [[TMP115]], i32 [[TMP112]] 1790; CHECK-NEXT: [[TMP118:%.*]] = zext i32 [[TMP117]] to i64 1791; CHECK-NEXT: [[TMP119:%.*]] = zext i32 [[TMP108]] to i64 1792; CHECK-NEXT: [[TMP120:%.*]] = mul i64 [[TMP118]], [[TMP119]] 1793; CHECK-NEXT: [[TMP121:%.*]] = trunc i64 [[TMP120]] to i32 1794; CHECK-NEXT: [[TMP122:%.*]] = lshr i64 [[TMP120]], 32 1795; CHECK-NEXT: [[TMP123:%.*]] = trunc i64 [[TMP122]] to i32 1796; CHECK-NEXT: [[TMP124:%.*]] = add i32 [[TMP108]], [[TMP123]] 1797; CHECK-NEXT: [[TMP125:%.*]] = sub i32 [[TMP108]], [[TMP123]] 1798; CHECK-NEXT: [[TMP126:%.*]] = select i1 [[TMP116]], i32 [[TMP124]], i32 [[TMP125]] 1799; CHECK-NEXT: [[TMP127:%.*]] = zext i32 [[TMP126]] to i64 1800; CHECK-NEXT: [[TMP128:%.*]] = zext i32 [[TMP103]] to i64 1801; CHECK-NEXT: [[TMP129:%.*]] = mul i64 [[TMP127]], [[TMP128]] 1802; CHECK-NEXT: [[TMP130:%.*]] = trunc i64 [[TMP129]] to i32 1803; CHECK-NEXT: [[TMP131:%.*]] = lshr i64 [[TMP129]], 32 1804; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32 1805; CHECK-NEXT: [[TMP133:%.*]] = mul i32 [[TMP132]], [[TMP104]] 1806; CHECK-NEXT: [[TMP134:%.*]] = sub i32 [[TMP103]], [[TMP133]] 1807; CHECK-NEXT: [[TMP135:%.*]] = icmp uge i32 [[TMP134]], [[TMP104]] 1808; CHECK-NEXT: [[TMP136:%.*]] = icmp uge i32 [[TMP103]], [[TMP133]] 1809; CHECK-NEXT: [[TMP137:%.*]] = and i1 [[TMP135]], [[TMP136]] 1810; CHECK-NEXT: [[TMP138:%.*]] = sub i32 
[[TMP134]], [[TMP104]] 1811; CHECK-NEXT: [[TMP139:%.*]] = add i32 [[TMP134]], [[TMP104]] 1812; CHECK-NEXT: [[TMP140:%.*]] = select i1 [[TMP137]], i32 [[TMP138]], i32 [[TMP134]] 1813; CHECK-NEXT: [[TMP141:%.*]] = select i1 [[TMP136]], i32 [[TMP140]], i32 [[TMP139]] 1814; CHECK-NEXT: [[TMP142:%.*]] = xor i32 [[TMP141]], [[TMP99]] 1815; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP142]], [[TMP99]] 1816; CHECK-NEXT: [[TMP144:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP143]], i64 2 1817; CHECK-NEXT: [[TMP145:%.*]] = extractelement <4 x i32> [[X]], i64 3 1818; CHECK-NEXT: [[TMP146:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1819; CHECK-NEXT: [[TMP147:%.*]] = ashr i32 [[TMP145]], 31 1820; CHECK-NEXT: [[TMP148:%.*]] = ashr i32 [[TMP146]], 31 1821; CHECK-NEXT: [[TMP149:%.*]] = add i32 [[TMP145]], [[TMP147]] 1822; CHECK-NEXT: [[TMP150:%.*]] = add i32 [[TMP146]], [[TMP148]] 1823; CHECK-NEXT: [[TMP151:%.*]] = xor i32 [[TMP149]], [[TMP147]] 1824; CHECK-NEXT: [[TMP152:%.*]] = xor i32 [[TMP150]], [[TMP148]] 1825; CHECK-NEXT: [[TMP153:%.*]] = uitofp i32 [[TMP152]] to float 1826; CHECK-NEXT: [[TMP154:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP153]]) 1827; CHECK-NEXT: [[TMP155:%.*]] = fmul fast float [[TMP154]], 0x41F0000000000000 1828; CHECK-NEXT: [[TMP156:%.*]] = fptoui float [[TMP155]] to i32 1829; CHECK-NEXT: [[TMP157:%.*]] = zext i32 [[TMP156]] to i64 1830; CHECK-NEXT: [[TMP158:%.*]] = zext i32 [[TMP152]] to i64 1831; CHECK-NEXT: [[TMP159:%.*]] = mul i64 [[TMP157]], [[TMP158]] 1832; CHECK-NEXT: [[TMP160:%.*]] = trunc i64 [[TMP159]] to i32 1833; CHECK-NEXT: [[TMP161:%.*]] = lshr i64 [[TMP159]], 32 1834; CHECK-NEXT: [[TMP162:%.*]] = trunc i64 [[TMP161]] to i32 1835; CHECK-NEXT: [[TMP163:%.*]] = sub i32 0, [[TMP160]] 1836; CHECK-NEXT: [[TMP164:%.*]] = icmp eq i32 [[TMP162]], 0 1837; CHECK-NEXT: [[TMP165:%.*]] = select i1 [[TMP164]], i32 [[TMP163]], i32 [[TMP160]] 1838; CHECK-NEXT: [[TMP166:%.*]] = zext i32 [[TMP165]] to i64 1839; CHECK-NEXT: [[TMP167:%.*]] = 
zext i32 [[TMP156]] to i64 1840; CHECK-NEXT: [[TMP168:%.*]] = mul i64 [[TMP166]], [[TMP167]] 1841; CHECK-NEXT: [[TMP169:%.*]] = trunc i64 [[TMP168]] to i32 1842; CHECK-NEXT: [[TMP170:%.*]] = lshr i64 [[TMP168]], 32 1843; CHECK-NEXT: [[TMP171:%.*]] = trunc i64 [[TMP170]] to i32 1844; CHECK-NEXT: [[TMP172:%.*]] = add i32 [[TMP156]], [[TMP171]] 1845; CHECK-NEXT: [[TMP173:%.*]] = sub i32 [[TMP156]], [[TMP171]] 1846; CHECK-NEXT: [[TMP174:%.*]] = select i1 [[TMP164]], i32 [[TMP172]], i32 [[TMP173]] 1847; CHECK-NEXT: [[TMP175:%.*]] = zext i32 [[TMP174]] to i64 1848; CHECK-NEXT: [[TMP176:%.*]] = zext i32 [[TMP151]] to i64 1849; CHECK-NEXT: [[TMP177:%.*]] = mul i64 [[TMP175]], [[TMP176]] 1850; CHECK-NEXT: [[TMP178:%.*]] = trunc i64 [[TMP177]] to i32 1851; CHECK-NEXT: [[TMP179:%.*]] = lshr i64 [[TMP177]], 32 1852; CHECK-NEXT: [[TMP180:%.*]] = trunc i64 [[TMP179]] to i32 1853; CHECK-NEXT: [[TMP181:%.*]] = mul i32 [[TMP180]], [[TMP152]] 1854; CHECK-NEXT: [[TMP182:%.*]] = sub i32 [[TMP151]], [[TMP181]] 1855; CHECK-NEXT: [[TMP183:%.*]] = icmp uge i32 [[TMP182]], [[TMP152]] 1856; CHECK-NEXT: [[TMP184:%.*]] = icmp uge i32 [[TMP151]], [[TMP181]] 1857; CHECK-NEXT: [[TMP185:%.*]] = and i1 [[TMP183]], [[TMP184]] 1858; CHECK-NEXT: [[TMP186:%.*]] = sub i32 [[TMP182]], [[TMP152]] 1859; CHECK-NEXT: [[TMP187:%.*]] = add i32 [[TMP182]], [[TMP152]] 1860; CHECK-NEXT: [[TMP188:%.*]] = select i1 [[TMP185]], i32 [[TMP186]], i32 [[TMP182]] 1861; CHECK-NEXT: [[TMP189:%.*]] = select i1 [[TMP184]], i32 [[TMP188]], i32 [[TMP187]] 1862; CHECK-NEXT: [[TMP190:%.*]] = xor i32 [[TMP189]], [[TMP147]] 1863; CHECK-NEXT: [[TMP191:%.*]] = sub i32 [[TMP190]], [[TMP147]] 1864; CHECK-NEXT: [[TMP192:%.*]] = insertelement <4 x i32> [[TMP144]], i32 [[TMP191]], i64 3 1865; CHECK-NEXT: store <4 x i32> [[TMP192]], <4 x i32> addrspace(1)* [[OUT:%.*]] 1866; CHECK-NEXT: ret void 1867; 1868; GCN-LABEL: srem_v4i32: 1869; GCN: ; %bb.0: 1870; GCN-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0xd 1871; GCN-NEXT: s_load_dwordx2 
s[8:9], s[0:1], 0x9 1872; GCN-NEXT: s_mov_b32 s11, 0xf000 1873; GCN-NEXT: s_mov_b32 s10, -1 1874; GCN-NEXT: s_waitcnt lgkmcnt(0) 1875; GCN-NEXT: s_ashr_i32 s2, s16, 31 1876; GCN-NEXT: s_add_i32 s3, s16, s2 1877; GCN-NEXT: s_xor_b32 s5, s3, s2 1878; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 1879; GCN-NEXT: s_mov_b32 s16, 0x4f800000 1880; GCN-NEXT: s_ashr_i32 s6, s12, 31 1881; GCN-NEXT: s_ashr_i32 s2, s17, 31 1882; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 1883; GCN-NEXT: s_add_i32 s0, s12, s6 1884; GCN-NEXT: s_add_i32 s3, s17, s2 1885; GCN-NEXT: s_xor_b32 s4, s0, s6 1886; GCN-NEXT: v_mul_f32_e32 v0, s16, v0 1887; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 1888; GCN-NEXT: s_xor_b32 s17, s3, s2 1889; GCN-NEXT: s_ashr_i32 s7, s13, 31 1890; GCN-NEXT: s_add_i32 s12, s13, s7 1891; GCN-NEXT: v_mul_lo_u32 v1, v0, s5 1892; GCN-NEXT: v_mul_hi_u32 v2, v0, s5 1893; GCN-NEXT: s_xor_b32 s12, s12, s7 1894; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 1895; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 1896; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 1897; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 1898; GCN-NEXT: v_cvt_f32_u32_e32 v2, s17 1899; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v0 1900; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 1901; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v2 1902; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 1903; GCN-NEXT: v_mul_hi_u32 v0, v0, s4 1904; GCN-NEXT: v_mul_f32_e32 v1, s16, v1 1905; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1906; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 1907; GCN-NEXT: v_mul_lo_u32 v4, v1, s17 1908; GCN-NEXT: v_mul_hi_u32 v5, v1, s17 1909; GCN-NEXT: v_sub_i32_e32 v2, vcc, s4, v0 1910; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 1911; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v2 1912; GCN-NEXT: v_add_i32_e32 v3, vcc, s5, v2 1913; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v2 1914; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 1915; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 1916; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] 1917; GCN-NEXT: v_mul_hi_u32 v4, v4, v1 1918; GCN-NEXT: v_add_i32_e32 v5, 
vcc, v4, v1 1919; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v4, v1 1920; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1921; GCN-NEXT: s_ashr_i32 s0, s18, 31 1922; GCN-NEXT: s_add_i32 s1, s18, s0 1923; GCN-NEXT: s_xor_b32 s13, s1, s0 1924; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 1925; GCN-NEXT: v_cvt_f32_u32_e32 v2, s13 1926; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] 1927; GCN-NEXT: v_mul_hi_u32 v1, v1, s12 1928; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[2:3] 1929; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 1930; GCN-NEXT: v_xor_b32_e32 v0, s6, v0 1931; GCN-NEXT: v_mul_lo_u32 v1, v1, s17 1932; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 1933; GCN-NEXT: v_mul_f32_e32 v2, s16, v2 1934; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 1935; GCN-NEXT: v_sub_i32_e32 v3, vcc, s12, v1 1936; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s12, v1 1937; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v3 1938; GCN-NEXT: v_mul_lo_u32 v5, v2, s13 1939; GCN-NEXT: v_mul_hi_u32 v6, v2, s13 1940; GCN-NEXT: v_add_i32_e32 v4, vcc, s17, v3 1941; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s17, v3 1942; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v5 1943; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 1944; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] 1945; GCN-NEXT: v_mul_hi_u32 v5, v5, v2 1946; GCN-NEXT: s_ashr_i32 s6, s14, 31 1947; GCN-NEXT: s_add_i32 s12, s14, s6 1948; GCN-NEXT: s_xor_b32 s12, s12, s6 1949; GCN-NEXT: v_add_i32_e32 v6, vcc, v5, v2 1950; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v5, v2 1951; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1952; GCN-NEXT: s_ashr_i32 s0, s19, 31 1953; GCN-NEXT: s_add_i32 s1, s19, s0 1954; GCN-NEXT: s_xor_b32 s14, s1, s0 1955; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 1956; GCN-NEXT: v_cvt_f32_u32_e32 v3, s14 1957; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] 1958; GCN-NEXT: v_mul_hi_u32 v2, v2, s12 1959; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[2:3] 1960; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 1961; GCN-NEXT: v_xor_b32_e32 v1, s7, v1 1962; GCN-NEXT: v_mul_lo_u32 v2, v2, s13 1963; GCN-NEXT: v_subrev_i32_e32 
v1, vcc, s7, v1 1964; GCN-NEXT: v_mul_f32_e32 v3, s16, v3 1965; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 1966; GCN-NEXT: s_ashr_i32 s7, s15, 31 1967; GCN-NEXT: v_sub_i32_e32 v4, vcc, s12, v2 1968; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s12, v2 1969; GCN-NEXT: v_mul_lo_u32 v6, v3, s14 1970; GCN-NEXT: v_mul_hi_u32 v7, v3, s14 1971; GCN-NEXT: s_add_i32 s12, s15, s7 1972; GCN-NEXT: s_xor_b32 s12, s12, s7 1973; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 1974; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7 1975; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] 1976; GCN-NEXT: v_mul_hi_u32 v6, v6, v3 1977; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 1978; GCN-NEXT: v_add_i32_e32 v5, vcc, s13, v4 1979; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s13, v4 1980; GCN-NEXT: v_add_i32_e32 v7, vcc, v6, v3 1981; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v6, v3 1982; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5] 1983; GCN-NEXT: v_mul_hi_u32 v3, v3, s12 1984; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1985; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 1986; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] 1987; GCN-NEXT: v_mul_lo_u32 v3, v3, s14 1988; GCN-NEXT: v_xor_b32_e32 v2, s6, v2 1989; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s6, v2 1990; GCN-NEXT: v_sub_i32_e32 v4, vcc, s12, v3 1991; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s12, v3 1992; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v4 1993; GCN-NEXT: v_add_i32_e32 v5, vcc, s14, v4 1994; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s14, v4 1995; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1996; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 1997; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[2:3] 1998; GCN-NEXT: v_xor_b32_e32 v3, s7, v3 1999; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s7, v3 2000; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 2001; GCN-NEXT: s_endpgm 2002 %r = srem <4 x i32> %x, %y 2003 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2004 ret void 2005} 2006 2007define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2008; CHECK-LABEL: 
@udiv_v4i16( 2009; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2010; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2011; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2012; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2013; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2014; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2015; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2016; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2017; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2018; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2019; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2020; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2021; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2022; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2023; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2024; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2025; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2026; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 2027; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 2028; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0 2029; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1 2030; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2031; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 2032; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 2033; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 2034; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 2035; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 2036; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float 
[[TMP25]], [[TMP27]] 2037; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 2038; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 2039; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 2040; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 2041; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 2042; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 2043; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 2044; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 2045; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 2046; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 2047; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 2048; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1 2049; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2 2050; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2051; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 2052; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 2053; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 2054; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 2055; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 2056; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 2057; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 2058; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 2059; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 2060; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 2061; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 2062; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 2063; 
CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 2064; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 2065; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 2066; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 2067; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 2068; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2 2069; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3 2070; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2071; CHECK-NEXT: [[TMP63:%.*]] = zext i16 [[TMP61]] to i32 2072; CHECK-NEXT: [[TMP64:%.*]] = zext i16 [[TMP62]] to i32 2073; CHECK-NEXT: [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float 2074; CHECK-NEXT: [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float 2075; CHECK-NEXT: [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]]) 2076; CHECK-NEXT: [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]] 2077; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]]) 2078; CHECK-NEXT: [[TMP70:%.*]] = fneg fast float [[TMP69]] 2079; CHECK-NEXT: [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]]) 2080; CHECK-NEXT: [[TMP72:%.*]] = fptoui float [[TMP69]] to i32 2081; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]]) 2082; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 2083; CHECK-NEXT: [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]] 2084; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0 2085; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]] 2086; CHECK-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 65535 2087; CHECK-NEXT: [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16 2088; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3 2089; CHECK-NEXT: store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]] 2090; CHECK-NEXT: ret void 2091; 2092; 
GCN-LABEL: udiv_v4i16: 2093; GCN: ; %bb.0: 2094; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2095; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2096; GCN-NEXT: s_mov_b32 s8, 0xffff 2097; GCN-NEXT: s_mov_b32 s7, 0xf000 2098; GCN-NEXT: s_mov_b32 s6, -1 2099; GCN-NEXT: s_waitcnt lgkmcnt(0) 2100; GCN-NEXT: s_and_b32 s9, s2, s8 2101; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 2102; GCN-NEXT: s_lshr_b32 s9, s0, 16 2103; GCN-NEXT: s_and_b32 s0, s0, s8 2104; GCN-NEXT: s_lshr_b32 s2, s2, 16 2105; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 2106; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 2107; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2108; GCN-NEXT: v_cvt_f32_u32_e32 v4, s9 2109; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 2110; GCN-NEXT: s_and_b32 s2, s3, s8 2111; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2112; GCN-NEXT: v_trunc_f32_e32 v2, v2 2113; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2114; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 2115; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2116; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 2117; GCN-NEXT: v_trunc_f32_e32 v1, v1 2118; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 2119; GCN-NEXT: v_mad_f32 v2, -v1, v3, v4 2120; GCN-NEXT: v_cvt_f32_u32_e32 v4, s2 2121; GCN-NEXT: s_lshr_b32 s0, s1, 16 2122; GCN-NEXT: s_and_b32 s1, s1, s8 2123; GCN-NEXT: s_lshr_b32 s10, s3, 16 2124; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 2125; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 2126; GCN-NEXT: v_cvt_f32_u32_e32 v3, s10 2127; GCN-NEXT: v_cvt_f32_u32_e32 v5, s1 2128; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 2129; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc 2130; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v3 2131; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2132; GCN-NEXT: v_mul_f32_e32 v1, v5, v6 2133; GCN-NEXT: v_cvt_f32_u32_e32 v6, s0 2134; GCN-NEXT: v_trunc_f32_e32 v1, v1 2135; GCN-NEXT: v_mad_f32 v5, -v1, v4, v5 2136; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 2137; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 2138; GCN-NEXT: v_mul_f32_e32 v4, v6, v7 2139; GCN-NEXT: v_trunc_f32_e32 v4, v4 2140; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4 2141; 
GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2142; GCN-NEXT: v_mad_f32 v4, -v4, v3, v6 2143; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 2144; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 2145; GCN-NEXT: v_and_b32_e32 v0, s8, v0 2146; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2147; GCN-NEXT: v_and_b32_e32 v1, s8, v1 2148; GCN-NEXT: v_or_b32_e32 v1, v1, v3 2149; GCN-NEXT: v_or_b32_e32 v0, v0, v2 2150; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2151; GCN-NEXT: s_endpgm 2152 %r = udiv <4 x i16> %x, %y 2153 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2154 ret void 2155} 2156 2157define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2158; CHECK-LABEL: @urem_v4i16( 2159; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2160; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2161; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2162; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2163; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2164; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2165; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2166; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2167; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2168; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2169; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2170; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2171; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2172; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2173; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2174; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2175; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2176; CHECK-NEXT: [[TMP18:%.*]] = 
mul i32 [[TMP17]], [[TMP4]] 2177; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 2178; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 2179; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 2180; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0 2181; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1 2182; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2183; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 2184; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 2185; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 2186; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 2187; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 2188; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 2189; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 2190; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 2191; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 2192; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 2193; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2194; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 2195; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 2196; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 2197; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 2198; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 2199; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 2200; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 2201; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 2202; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1 2203; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2 2204; CHECK-NEXT: [[TMP46:%.*]] = 
extractelement <4 x i16> [[Y]], i64 2 2205; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 2206; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 2207; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 2208; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 2209; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 2210; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 2211; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 2212; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 2213; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 2214; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 2215; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 2216; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 2217; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 2218; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 2219; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 2220; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 2221; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 2222; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 2223; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 2224; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2 2225; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3 2226; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2227; CHECK-NEXT: [[TMP69:%.*]] = zext i16 [[TMP67]] to i32 2228; CHECK-NEXT: [[TMP70:%.*]] = zext i16 [[TMP68]] to i32 2229; CHECK-NEXT: [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float 2230; CHECK-NEXT: [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float 2231; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]]) 2232; CHECK-NEXT: [[TMP74:%.*]] = 
fmul fast float [[TMP71]], [[TMP73]] 2233; CHECK-NEXT: [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]]) 2234; CHECK-NEXT: [[TMP76:%.*]] = fneg fast float [[TMP75]] 2235; CHECK-NEXT: [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]]) 2236; CHECK-NEXT: [[TMP78:%.*]] = fptoui float [[TMP75]] to i32 2237; CHECK-NEXT: [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]]) 2238; CHECK-NEXT: [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]]) 2239; CHECK-NEXT: [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]] 2240; CHECK-NEXT: [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0 2241; CHECK-NEXT: [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]] 2242; CHECK-NEXT: [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]] 2243; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]] 2244; CHECK-NEXT: [[TMP86:%.*]] = and i32 [[TMP85]], 65535 2245; CHECK-NEXT: [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16 2246; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3 2247; CHECK-NEXT: store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]] 2248; CHECK-NEXT: ret void 2249; 2250; GCN-LABEL: urem_v4i16: 2251; GCN: ; %bb.0: 2252; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2253; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2254; GCN-NEXT: s_mov_b32 s8, 0xffff 2255; GCN-NEXT: s_mov_b32 s7, 0xf000 2256; GCN-NEXT: s_mov_b32 s6, -1 2257; GCN-NEXT: s_waitcnt lgkmcnt(0) 2258; GCN-NEXT: s_and_b32 s9, s2, s8 2259; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 2260; GCN-NEXT: s_and_b32 s10, s0, s8 2261; GCN-NEXT: s_lshr_b32 s11, s2, 16 2262; GCN-NEXT: v_cvt_f32_u32_e32 v1, s10 2263; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2264; GCN-NEXT: v_cvt_f32_u32_e32 v3, s11 2265; GCN-NEXT: s_lshr_b32 s9, s0, 16 2266; GCN-NEXT: v_cvt_f32_u32_e32 v4, s9 2267; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2268; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 2269; GCN-NEXT: v_trunc_f32_e32 v2, v2 2270; GCN-NEXT: v_mad_f32 v1, 
-v2, v0, v1 2271; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 2272; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2273; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 2274; GCN-NEXT: v_trunc_f32_e32 v1, v1 2275; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 2276; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 2277; GCN-NEXT: v_mad_f32 v1, -v1, v3, v4 2278; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 2279; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 2280; GCN-NEXT: s_and_b32 s2, s3, s8 2281; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 2282; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2 2283; GCN-NEXT: s_and_b32 s2, s1, s8 2284; GCN-NEXT: v_mul_lo_u32 v1, v1, s11 2285; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 2286; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 2287; GCN-NEXT: s_lshr_b32 s12, s3, 16 2288; GCN-NEXT: v_sub_i32_e32 v5, vcc, s9, v1 2289; GCN-NEXT: s_lshr_b32 s10, s1, 16 2290; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 2291; GCN-NEXT: v_cvt_f32_u32_e32 v4, s12 2292; GCN-NEXT: v_cvt_f32_u32_e32 v6, s10 2293; GCN-NEXT: v_trunc_f32_e32 v1, v1 2294; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2295; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v4 2296; GCN-NEXT: v_mad_f32 v3, -v1, v2, v3 2297; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 2298; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 2299; GCN-NEXT: v_mul_f32_e32 v2, v6, v7 2300; GCN-NEXT: v_trunc_f32_e32 v2, v2 2301; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 2302; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2303; GCN-NEXT: v_mad_f32 v2, -v2, v4, v6 2304; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 2305; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 2306; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 2307; GCN-NEXT: v_mul_lo_u32 v2, v2, s12 2308; GCN-NEXT: v_and_b32_e32 v0, s8, v0 2309; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 2310; GCN-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 2311; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2312; GCN-NEXT: v_and_b32_e32 v1, s8, v1 2313; GCN-NEXT: v_or_b32_e32 v1, v1, v2 2314; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 2315; GCN-NEXT: v_or_b32_e32 v0, v0, v2 2316; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 
2317; GCN-NEXT: s_endpgm 2318 %r = urem <4 x i16> %x, %y 2319 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2320 ret void 2321} 2322 2323define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2324; CHECK-LABEL: @sdiv_v4i16( 2325; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2326; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2327; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2328; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 2329; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2330; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2331; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2332; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2333; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2334; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2335; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2336; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2337; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2338; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2339; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2340; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2341; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2342; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2343; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2344; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2345; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 2346; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 2347; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 2348; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0 2349; CHECK-NEXT: [[TMP25:%.*]] = 
extractelement <4 x i16> [[X]], i64 1 2350; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2351; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 2352; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 2353; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 2354; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 2355; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 2356; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 2357; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 2358; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 2359; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 2360; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 2361; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 2362; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 2363; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 2364; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 2365; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2366; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 2367; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 2368; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 2369; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 2370; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 2371; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 2372; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1 2373; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2 2374; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2375; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 2376; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 2377; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 2378; 
CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 2379; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 2380; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 2381; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 2382; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 2383; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 2384; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 2385; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 2386; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 2387; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 2388; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 2389; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 2390; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 2391; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 2392; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 2393; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 2394; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 2395; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 2396; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2 2397; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3 2398; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2399; CHECK-NEXT: [[TMP75:%.*]] = sext i16 [[TMP73]] to i32 2400; CHECK-NEXT: [[TMP76:%.*]] = sext i16 [[TMP74]] to i32 2401; CHECK-NEXT: [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]] 2402; CHECK-NEXT: [[TMP78:%.*]] = ashr i32 [[TMP77]], 30 2403; CHECK-NEXT: [[TMP79:%.*]] = or i32 [[TMP78]], 1 2404; CHECK-NEXT: [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float 2405; CHECK-NEXT: [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float 2406; CHECK-NEXT: [[TMP82:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP81]]) 2407; CHECK-NEXT: [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]] 2408; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]]) 2409; CHECK-NEXT: [[TMP85:%.*]] = fneg fast float [[TMP84]] 2410; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]]) 2411; CHECK-NEXT: [[TMP87:%.*]] = fptosi float [[TMP84]] to i32 2412; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]]) 2413; CHECK-NEXT: [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]]) 2414; CHECK-NEXT: [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]] 2415; CHECK-NEXT: [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0 2416; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]] 2417; CHECK-NEXT: [[TMP93:%.*]] = shl i32 [[TMP92]], 16 2418; CHECK-NEXT: [[TMP94:%.*]] = ashr i32 [[TMP93]], 16 2419; CHECK-NEXT: [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16 2420; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3 2421; CHECK-NEXT: store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]] 2422; CHECK-NEXT: ret void 2423; 2424; GCN-LABEL: sdiv_v4i16: 2425; GCN: ; %bb.0: 2426; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2427; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2428; GCN-NEXT: s_mov_b32 s7, 0xf000 2429; GCN-NEXT: s_mov_b32 s6, -1 2430; GCN-NEXT: s_waitcnt lgkmcnt(0) 2431; GCN-NEXT: s_sext_i32_i16 s8, s2 2432; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 2433; GCN-NEXT: s_sext_i32_i16 s9, s0 2434; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 2435; GCN-NEXT: s_xor_b32 s8, s9, s8 2436; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2437; GCN-NEXT: s_ashr_i32 s2, s2, 16 2438; GCN-NEXT: s_ashr_i32 s8, s8, 30 2439; GCN-NEXT: s_or_b32 s8, s8, 1 2440; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2441; GCN-NEXT: v_trunc_f32_e32 v2, v2 2442; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2443; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 2444; GCN-NEXT: 
v_cvt_i32_f32_e32 v2, v2 2445; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 2446; GCN-NEXT: v_mov_b32_e32 v3, s8 2447; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 2448; GCN-NEXT: s_ashr_i32 s0, s0, 16 2449; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2450; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2451; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 2452; GCN-NEXT: s_xor_b32 s0, s0, s2 2453; GCN-NEXT: s_ashr_i32 s0, s0, 30 2454; GCN-NEXT: s_or_b32 s0, s0, 1 2455; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 2456; GCN-NEXT: v_trunc_f32_e32 v3, v3 2457; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 2458; GCN-NEXT: v_mov_b32_e32 v4, s0 2459; GCN-NEXT: s_sext_i32_i16 s0, s3 2460; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 2461; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 2462; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2463; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 2464; GCN-NEXT: s_sext_i32_i16 s2, s1 2465; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v3 2466; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 2467; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 2468; GCN-NEXT: s_xor_b32 s0, s2, s0 2469; GCN-NEXT: s_ashr_i32 s0, s0, 30 2470; GCN-NEXT: s_or_b32 s0, s0, 1 2471; GCN-NEXT: v_mul_f32_e32 v4, v1, v4 2472; GCN-NEXT: v_trunc_f32_e32 v4, v4 2473; GCN-NEXT: v_mad_f32 v1, -v4, v2, v1 2474; GCN-NEXT: v_mov_b32_e32 v5, s0 2475; GCN-NEXT: s_ashr_i32 s0, s3, 16 2476; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 2477; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 2478; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2479; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 2480; GCN-NEXT: s_ashr_i32 s1, s1, 16 2481; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 2482; GCN-NEXT: v_cvt_f32_i32_e32 v4, s1 2483; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 2484; GCN-NEXT: s_xor_b32 s0, s1, s0 2485; GCN-NEXT: s_ashr_i32 s0, s0, 30 2486; GCN-NEXT: s_or_b32 s0, s0, 1 2487; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 2488; GCN-NEXT: v_trunc_f32_e32 v5, v5 2489; GCN-NEXT: v_mad_f32 v4, -v5, v2, v4 2490; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 2491; GCN-NEXT: v_mov_b32_e32 v6, s0 2492; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 2493; 
GCN-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 2494; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 2495; GCN-NEXT: s_mov_b32 s0, 0xffff 2496; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2497; GCN-NEXT: v_and_b32_e32 v1, s0, v1 2498; GCN-NEXT: v_or_b32_e32 v1, v1, v2 2499; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 2500; GCN-NEXT: v_and_b32_e32 v0, s0, v0 2501; GCN-NEXT: v_or_b32_e32 v0, v0, v2 2502; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2503; GCN-NEXT: s_endpgm 2504 %r = sdiv <4 x i16> %x, %y 2505 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2506 ret void 2507} 2508 2509define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2510; CHECK-LABEL: @srem_v4i16( 2511; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2512; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2513; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2514; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 2515; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2516; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2517; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2518; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2519; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2520; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2521; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2522; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2523; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2524; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2525; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2526; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2527; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2528; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2529; 
CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2530; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2531; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 2532; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 2533; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 2534; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 2535; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 2536; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0 2537; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1 2538; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2539; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 2540; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 2541; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 2542; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 2543; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 2544; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 2545; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 2546; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 2547; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 2548; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 2549; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 2550; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 2551; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 2552; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 2553; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 2554; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 2555; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 2556; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 2557; CHECK-NEXT: [[TMP47:%.*]] = mul i32 
[[TMP46]], [[TMP30]] 2558; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 2559; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 2560; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 2561; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 2562; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1 2563; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2 2564; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2565; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 2566; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 2567; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 2568; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 2569; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 2570; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 2571; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 2572; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 2573; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 2574; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 2575; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 2576; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 2577; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 2578; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 2579; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 2580; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 2581; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 2582; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 2583; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 2584; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 2585; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 2586; CHECK-NEXT: [[TMP76:%.*]] 
= ashr i32 [[TMP75]], 16 2587; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 2588; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2 2589; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3 2590; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2591; CHECK-NEXT: [[TMP81:%.*]] = sext i16 [[TMP79]] to i32 2592; CHECK-NEXT: [[TMP82:%.*]] = sext i16 [[TMP80]] to i32 2593; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]] 2594; CHECK-NEXT: [[TMP84:%.*]] = ashr i32 [[TMP83]], 30 2595; CHECK-NEXT: [[TMP85:%.*]] = or i32 [[TMP84]], 1 2596; CHECK-NEXT: [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float 2597; CHECK-NEXT: [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float 2598; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]]) 2599; CHECK-NEXT: [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]] 2600; CHECK-NEXT: [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]]) 2601; CHECK-NEXT: [[TMP91:%.*]] = fneg fast float [[TMP90]] 2602; CHECK-NEXT: [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]]) 2603; CHECK-NEXT: [[TMP93:%.*]] = fptosi float [[TMP90]] to i32 2604; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]]) 2605; CHECK-NEXT: [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]]) 2606; CHECK-NEXT: [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]] 2607; CHECK-NEXT: [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0 2608; CHECK-NEXT: [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]] 2609; CHECK-NEXT: [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]] 2610; CHECK-NEXT: [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]] 2611; CHECK-NEXT: [[TMP101:%.*]] = shl i32 [[TMP100]], 16 2612; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP101]], 16 2613; CHECK-NEXT: [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16 2614; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 
[[TMP103]], i64 3 2615; CHECK-NEXT: store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]] 2616; CHECK-NEXT: ret void 2617; 2618; GCN-LABEL: srem_v4i16: 2619; GCN: ; %bb.0: 2620; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2621; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2622; GCN-NEXT: s_mov_b32 s7, 0xf000 2623; GCN-NEXT: s_mov_b32 s6, -1 2624; GCN-NEXT: s_waitcnt lgkmcnt(0) 2625; GCN-NEXT: s_sext_i32_i16 s8, s2 2626; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 2627; GCN-NEXT: s_sext_i32_i16 s9, s0 2628; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 2629; GCN-NEXT: s_xor_b32 s8, s9, s8 2630; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2631; GCN-NEXT: s_ashr_i32 s8, s8, 30 2632; GCN-NEXT: s_or_b32 s8, s8, 1 2633; GCN-NEXT: v_mov_b32_e32 v3, s8 2634; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2635; GCN-NEXT: v_trunc_f32_e32 v2, v2 2636; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2637; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2638; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 2639; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 2640; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2641; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 2642; GCN-NEXT: s_ashr_i32 s2, s2, 16 2643; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 2644; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2645; GCN-NEXT: s_ashr_i32 s0, s0, 16 2646; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2647; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 2648; GCN-NEXT: s_xor_b32 s8, s0, s2 2649; GCN-NEXT: s_ashr_i32 s8, s8, 30 2650; GCN-NEXT: s_or_b32 s8, s8, 1 2651; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 2652; GCN-NEXT: v_trunc_f32_e32 v3, v3 2653; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 2654; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 2655; GCN-NEXT: v_mov_b32_e32 v4, s8 2656; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 2657; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 2658; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 2659; GCN-NEXT: v_mul_lo_u32 v1, v1, s2 2660; GCN-NEXT: s_sext_i32_i16 s2, s3 2661; GCN-NEXT: v_cvt_f32_i32_e32 v2, s2 2662; GCN-NEXT: v_sub_i32_e32 v3, vcc, s0, v1 2663; GCN-NEXT: s_sext_i32_i16 s0, s1 2664; 
GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 2665; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 2666; GCN-NEXT: s_xor_b32 s0, s0, s2 2667; GCN-NEXT: s_ashr_i32 s0, s0, 30 2668; GCN-NEXT: s_or_b32 s0, s0, 1 2669; GCN-NEXT: v_mul_f32_e32 v4, v1, v4 2670; GCN-NEXT: v_trunc_f32_e32 v4, v4 2671; GCN-NEXT: v_mad_f32 v1, -v4, v2, v1 2672; GCN-NEXT: v_mov_b32_e32 v5, s0 2673; GCN-NEXT: s_ashr_i32 s0, s3, 16 2674; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 2675; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 2676; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2677; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 2678; GCN-NEXT: s_ashr_i32 s2, s1, 16 2679; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 2680; GCN-NEXT: v_cvt_f32_i32_e32 v4, s2 2681; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 2682; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 2683; GCN-NEXT: s_xor_b32 s3, s2, s0 2684; GCN-NEXT: s_ashr_i32 s3, s3, 30 2685; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 2686; GCN-NEXT: v_trunc_f32_e32 v5, v5 2687; GCN-NEXT: v_mad_f32 v4, -v5, v2, v4 2688; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 2689; GCN-NEXT: s_or_b32 s3, s3, 1 2690; GCN-NEXT: v_mov_b32_e32 v6, s3 2691; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 2692; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 2693; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 2694; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 2695; GCN-NEXT: s_mov_b32 s0, 0xffff 2696; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 2697; GCN-NEXT: v_and_b32_e32 v1, s0, v1 2698; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 2699; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2700; GCN-NEXT: v_or_b32_e32 v1, v1, v2 2701; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 2702; GCN-NEXT: v_and_b32_e32 v0, s0, v0 2703; GCN-NEXT: v_or_b32_e32 v0, v0, v2 2704; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2705; GCN-NEXT: s_endpgm 2706 %r = srem <4 x i16> %x, %y 2707 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2708 ret void 2709} 2710 2711define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2712; CHECK-LABEL: @udiv_i3( 2713; CHECK-NEXT: [[TMP1:%.*]] = zext i3 
[[X:%.*]] to i32 2714; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 2715; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 2716; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 2717; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 2718; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 2719; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 2720; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 2721; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 2722; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 2723; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2724; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 2725; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 2726; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 2727; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 2728; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7 2729; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3 2730; CHECK-NEXT: store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]] 2731; CHECK-NEXT: ret void 2732; 2733; GCN-LABEL: udiv_i3: 2734; GCN: ; %bb.0: 2735; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2736; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2737; GCN-NEXT: s_mov_b32 s7, 0xf000 2738; GCN-NEXT: s_mov_b32 s6, -1 2739; GCN-NEXT: s_waitcnt lgkmcnt(0) 2740; GCN-NEXT: s_bfe_u32 s1, s0, 0x30008 2741; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 2742; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 2743; GCN-NEXT: s_and_b32 s0, s0, 7 2744; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 2745; GCN-NEXT: v_mul_f32_e32 v1, v2, v1 2746; GCN-NEXT: v_trunc_f32_e32 v1, v1 2747; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1 2748; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2 2749; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2750; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 2751; GCN-NEXT: 
v_and_b32_e32 v0, 7, v0 2752; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2753; GCN-NEXT: s_endpgm 2754 %r = udiv i3 %x, %y 2755 store i3 %r, i3 addrspace(1)* %out 2756 ret void 2757} 2758 2759define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2760; CHECK-LABEL: @urem_i3( 2761; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 2762; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 2763; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 2764; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 2765; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 2766; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 2767; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 2768; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 2769; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 2770; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 2771; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2772; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 2773; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 2774; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 2775; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 2776; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 2777; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 2778; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 2779; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3 2780; CHECK-NEXT: store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]] 2781; CHECK-NEXT: ret void 2782; 2783; GCN-LABEL: urem_i3: 2784; GCN: ; %bb.0: 2785; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2786; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2787; GCN-NEXT: s_mov_b32 s7, 0xf000 2788; GCN-NEXT: s_mov_b32 s6, -1 2789; GCN-NEXT: s_waitcnt lgkmcnt(0) 2790; GCN-NEXT: s_bfe_u32 
s1, s0, 0x30008 2791; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 2792; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 2793; GCN-NEXT: s_and_b32 s2, s0, 7 2794; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 2795; GCN-NEXT: s_lshr_b32 s1, s0, 8 2796; GCN-NEXT: v_mul_f32_e32 v1, v2, v1 2797; GCN-NEXT: v_trunc_f32_e32 v1, v1 2798; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1 2799; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2 2800; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2801; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 2802; GCN-NEXT: v_mul_lo_u32 v0, v0, s1 2803; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2804; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2805; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2806; GCN-NEXT: s_endpgm 2807 %r = urem i3 %x, %y 2808 store i3 %r, i3 addrspace(1)* %out 2809 ret void 2810} 2811 2812define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2813; CHECK-LABEL: @sdiv_i3( 2814; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 2815; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 2816; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 2817; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 2818; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 2819; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 2820; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 2821; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 2822; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 2823; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 2824; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 2825; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 2826; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 2827; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 2828; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 2829; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float 
[[TMP14]], [[TMP15]] 2830; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 2831; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 2832; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 29 2833; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 29 2834; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3 2835; CHECK-NEXT: store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]] 2836; CHECK-NEXT: ret void 2837; 2838; GCN-LABEL: sdiv_i3: 2839; GCN: ; %bb.0: 2840; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2841; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2842; GCN-NEXT: s_mov_b32 s7, 0xf000 2843; GCN-NEXT: s_mov_b32 s6, -1 2844; GCN-NEXT: s_waitcnt lgkmcnt(0) 2845; GCN-NEXT: s_bfe_i32 s1, s0, 0x30008 2846; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 2847; GCN-NEXT: s_bfe_i32 s0, s0, 0x30000 2848; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 2849; GCN-NEXT: s_xor_b32 s0, s0, s1 2850; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2851; GCN-NEXT: s_ashr_i32 s0, s0, 30 2852; GCN-NEXT: s_or_b32 s0, s0, 1 2853; GCN-NEXT: v_mov_b32_e32 v3, s0 2854; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2855; GCN-NEXT: v_trunc_f32_e32 v2, v2 2856; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2857; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2858; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 2859; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 2860; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2861; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2862; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2863; GCN-NEXT: s_endpgm 2864 %r = sdiv i3 %x, %y 2865 store i3 %r, i3 addrspace(1)* %out 2866 ret void 2867} 2868 2869define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2870; CHECK-LABEL: @srem_i3( 2871; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 2872; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 2873; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 2874; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 2875; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 2876; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to 
float 2877; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 2878; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 2879; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 2880; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 2881; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 2882; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 2883; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 2884; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 2885; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 2886; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 2887; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 2888; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 2889; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 2890; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 2891; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 29 2892; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 29 2893; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3 2894; CHECK-NEXT: store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]] 2895; CHECK-NEXT: ret void 2896; 2897; GCN-LABEL: srem_i3: 2898; GCN: ; %bb.0: 2899; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2900; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2901; GCN-NEXT: s_mov_b32 s7, 0xf000 2902; GCN-NEXT: s_mov_b32 s6, -1 2903; GCN-NEXT: s_waitcnt lgkmcnt(0) 2904; GCN-NEXT: s_bfe_i32 s1, s0, 0x30008 2905; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 2906; GCN-NEXT: s_bfe_i32 s3, s0, 0x30000 2907; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 2908; GCN-NEXT: s_xor_b32 s1, s3, s1 2909; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2910; GCN-NEXT: s_ashr_i32 s1, s1, 30 2911; GCN-NEXT: s_or_b32 s1, s1, 1 2912; GCN-NEXT: v_mov_b32_e32 v3, s1 2913; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2914; GCN-NEXT: 
v_trunc_f32_e32 v2, v2 2915; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2916; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2917; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 2918; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 2919; GCN-NEXT: s_lshr_b32 s2, s0, 8 2920; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2921; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 2922; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2923; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2924; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2925; GCN-NEXT: s_endpgm 2926 %r = srem i3 %x, %y 2927 store i3 %r, i3 addrspace(1)* %out 2928 ret void 2929} 2930 2931define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 2932; CHECK-LABEL: @udiv_v3i16( 2933; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 2934; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 2935; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2936; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2937; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2938; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2939; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2940; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2941; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2942; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2943; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2944; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2945; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2946; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2947; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2948; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2949; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2950; CHECK-NEXT: [[TMP18:%.*]] = and i32 
[[TMP17]], 65535 2951; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 2952; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0 2953; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1 2954; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1 2955; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 2956; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 2957; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 2958; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 2959; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 2960; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 2961; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 2962; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 2963; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 2964; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 2965; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 2966; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 2967; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 2968; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 2969; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 2970; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 2971; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 2972; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1 2973; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2 2974; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2 2975; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 2976; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 2977; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 2978; CHECK-NEXT: [[TMP46:%.*]] = uitofp 
i32 [[TMP44]] to float 2979; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 2980; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 2981; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 2982; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 2983; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 2984; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 2985; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 2986; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 2987; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 2988; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 2989; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 2990; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 2991; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 2992; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2 2993; CHECK-NEXT: store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]] 2994; CHECK-NEXT: ret void 2995; 2996; GCN-LABEL: udiv_v3i16: 2997; GCN: ; %bb.0: 2998; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2999; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3000; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3001; GCN-NEXT: s_mov_b32 s8, 0xffff 3002; GCN-NEXT: s_mov_b32 s7, 0xf000 3003; GCN-NEXT: s_waitcnt lgkmcnt(0) 3004; GCN-NEXT: s_and_b32 s6, s0, s8 3005; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 3006; GCN-NEXT: s_and_b32 s6, s2, s8 3007; GCN-NEXT: s_lshr_b32 s0, s0, 16 3008; GCN-NEXT: v_cvt_f32_u32_e32 v3, s0 3009; GCN-NEXT: v_cvt_f32_u32_e32 v1, s6 3010; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 3011; GCN-NEXT: s_lshr_b32 s0, s2, 16 3012; GCN-NEXT: v_cvt_f32_u32_e32 v4, s0 3013; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 3014; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 3015; GCN-NEXT: v_trunc_f32_e32 v2, 
v2 3016; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 3017; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 3018; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3019; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 3020; GCN-NEXT: v_trunc_f32_e32 v1, v1 3021; GCN-NEXT: s_and_b32 s0, s1, s8 3022; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 3023; GCN-NEXT: v_mad_f32 v2, -v1, v3, v4 3024; GCN-NEXT: v_cvt_f32_u32_e32 v4, s0 3025; GCN-NEXT: s_and_b32 s0, s3, s8 3026; GCN-NEXT: v_cvt_f32_u32_e32 v5, s0 3027; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 3028; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 3029; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 3030; GCN-NEXT: s_mov_b32 s6, -1 3031; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3032; GCN-NEXT: v_mul_f32_e32 v2, v5, v6 3033; GCN-NEXT: v_trunc_f32_e32 v2, v2 3034; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 3035; GCN-NEXT: v_mad_f32 v2, -v2, v4, v5 3036; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 3037; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3038; GCN-NEXT: v_and_b32_e32 v0, s8, v0 3039; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 3040; GCN-NEXT: v_or_b32_e32 v0, v0, v1 3041; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 3042; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3043; GCN-NEXT: s_endpgm 3044 %r = udiv <3 x i16> %x, %y 3045 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 3046 ret void 3047} 3048 3049define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 3050; CHECK-LABEL: @urem_v3i16( 3051; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3052; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3053; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3054; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3055; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3056; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3057; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3058; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 
3059; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3060; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3061; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3062; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3063; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3064; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3065; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3066; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3067; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3068; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 3069; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 3070; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 3071; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 3072; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0 3073; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1 3074; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3075; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 3076; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 3077; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 3078; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 3079; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 3080; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 3081; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 3082; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 3083; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 3084; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 3085; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3086; 
CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 3087; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 3088; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 3089; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 3090; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 3091; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 3092; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 3093; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 3094; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1 3095; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2 3096; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3097; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 3098; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 3099; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 3100; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 3101; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 3102; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 3103; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 3104; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 3105; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 3106; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 3107; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 3108; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 3109; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 3110; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 3111; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 3112; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 3113; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 
3114; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 3115; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 3116; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2 3117; CHECK-NEXT: store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]] 3118; CHECK-NEXT: ret void 3119; 3120; GCN-LABEL: urem_v3i16: 3121; GCN: ; %bb.0: 3122; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3123; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3124; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3125; GCN-NEXT: s_mov_b32 s8, 0xffff 3126; GCN-NEXT: s_mov_b32 s7, 0xf000 3127; GCN-NEXT: s_waitcnt lgkmcnt(0) 3128; GCN-NEXT: v_mov_b32_e32 v1, s2 3129; GCN-NEXT: s_and_b32 s6, s0, s8 3130; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 3131; GCN-NEXT: s_and_b32 s6, s2, s8 3132; GCN-NEXT: v_cvt_f32_u32_e32 v2, s6 3133; GCN-NEXT: v_mov_b32_e32 v4, s0 3134; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v0 3135; GCN-NEXT: v_alignbit_b32 v4, s1, v4, 16 3136; GCN-NEXT: v_and_b32_e32 v5, s8, v4 3137; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 16 3138; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 3139; GCN-NEXT: v_trunc_f32_e32 v3, v3 3140; GCN-NEXT: v_mad_f32 v2, -v3, v0, v2 3141; GCN-NEXT: v_cvt_u32_f32_e32 v6, v3 3142; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 3143; GCN-NEXT: v_cvt_f32_u32_e32 v2, v5 3144; GCN-NEXT: v_and_b32_e32 v3, s8, v1 3145; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 3146; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 3147; GCN-NEXT: s_and_b32 s0, s1, s8 3148; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3 3149; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 3150; GCN-NEXT: v_cvt_f32_u32_e32 v6, s0 3151; GCN-NEXT: s_and_b32 s0, s3, s8 3152; GCN-NEXT: v_cvt_f32_u32_e32 v7, s0 3153; GCN-NEXT: v_mul_f32_e32 v5, v3, v5 3154; GCN-NEXT: v_trunc_f32_e32 v5, v5 3155; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v6 3156; GCN-NEXT: v_mad_f32 v3, -v5, v2, v3 3157; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 3158; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 3159; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 3160; GCN-NEXT: v_mul_f32_e32 v3, 
v7, v8 3161; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 3162; GCN-NEXT: v_trunc_f32_e32 v3, v3 3163; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 3164; GCN-NEXT: v_cvt_u32_f32_e32 v4, v3 3165; GCN-NEXT: v_mad_f32 v3, -v3, v6, v7 3166; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 3167; GCN-NEXT: s_mov_b32 s6, -1 3168; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 3169; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 3170; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 3171; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3172; GCN-NEXT: v_and_b32_e32 v0, s8, v0 3173; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 3174; GCN-NEXT: v_or_b32_e32 v0, v0, v1 3175; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 3176; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3177; GCN-NEXT: s_endpgm 3178 %r = urem <3 x i16> %x, %y 3179 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 3180 ret void 3181} 3182 3183define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 3184; CHECK-LABEL: @sdiv_v3i16( 3185; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3186; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3187; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 3188; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 3189; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3190; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3191; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3192; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3193; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3194; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3195; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3196; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3197; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3198; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3199; CHECK-NEXT: 
[[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3200; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3201; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3202; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3203; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3204; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3205; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 3206; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 3207; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 3208; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0 3209; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1 3210; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3211; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 3212; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 3213; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 3214; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 3215; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 3216; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 3217; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 3218; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 3219; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 3220; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 3221; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 3222; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 3223; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 3224; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 3225; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3226; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 3227; CHECK-NEXT: 
[[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 3228; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 3229; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 3230; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 3231; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 3232; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1 3233; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2 3234; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3235; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 3236; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 3237; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 3238; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 3239; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 3240; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 3241; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 3242; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 3243; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 3244; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 3245; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 3246; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 3247; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 3248; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 3249; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 3250; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 3251; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 3252; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 3253; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 3254; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 3255; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 
3256; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2 3257; CHECK-NEXT: store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]] 3258; CHECK-NEXT: ret void 3259; 3260; GCN-LABEL: sdiv_v3i16: 3261; GCN: ; %bb.0: 3262; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3263; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3264; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3265; GCN-NEXT: s_mov_b32 s7, 0xf000 3266; GCN-NEXT: s_mov_b32 s6, -1 3267; GCN-NEXT: s_waitcnt lgkmcnt(0) 3268; GCN-NEXT: s_sext_i32_i16 s9, s2 3269; GCN-NEXT: s_sext_i32_i16 s8, s0 3270; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 3271; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 3272; GCN-NEXT: s_xor_b32 s8, s9, s8 3273; GCN-NEXT: s_ashr_i32 s0, s0, 16 3274; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 3275; GCN-NEXT: s_ashr_i32 s8, s8, 30 3276; GCN-NEXT: s_or_b32 s8, s8, 1 3277; GCN-NEXT: v_mov_b32_e32 v3, s8 3278; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 3279; GCN-NEXT: v_trunc_f32_e32 v2, v2 3280; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 3281; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3282; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 3283; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 3284; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3285; GCN-NEXT: s_ashr_i32 s2, s2, 16 3286; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3287; GCN-NEXT: v_cvt_f32_i32_e32 v2, s2 3288; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 3289; GCN-NEXT: s_xor_b32 s0, s2, s0 3290; GCN-NEXT: s_ashr_i32 s0, s0, 30 3291; GCN-NEXT: s_or_b32 s0, s0, 1 3292; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 3293; GCN-NEXT: v_trunc_f32_e32 v3, v3 3294; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 3295; GCN-NEXT: v_mov_b32_e32 v4, s0 3296; GCN-NEXT: s_sext_i32_i16 s0, s1 3297; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 3298; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 3299; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 3300; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 3301; GCN-NEXT: s_sext_i32_i16 s1, s3 3302; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 3303; GCN-NEXT: v_cvt_f32_i32_e32 v3, s1 3304; GCN-NEXT: 
v_rcp_iflag_f32_e32 v4, v2 3305; GCN-NEXT: s_xor_b32 s0, s1, s0 3306; GCN-NEXT: s_ashr_i32 s0, s0, 30 3307; GCN-NEXT: s_or_b32 s0, s0, 1 3308; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3309; GCN-NEXT: v_trunc_f32_e32 v4, v4 3310; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 3311; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 3312; GCN-NEXT: v_mov_b32_e32 v5, s0 3313; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 3314; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 3315; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 3316; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3317; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 3318; GCN-NEXT: v_or_b32_e32 v0, v0, v1 3319; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 3320; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3321; GCN-NEXT: s_endpgm 3322 %r = sdiv <3 x i16> %x, %y 3323 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 3324 ret void 3325} 3326 3327define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 3328; CHECK-LABEL: @srem_v3i16( 3329; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3330; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3331; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 3332; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 3333; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3334; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3335; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3336; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3337; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3338; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3339; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3340; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3341; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3342; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3343; CHECK-NEXT: 
[[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3344; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3345; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3346; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3347; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3348; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3349; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 3350; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 3351; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 3352; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 3353; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 3354; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0 3355; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1 3356; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3357; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 3358; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 3359; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 3360; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 3361; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 3362; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 3363; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 3364; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3365; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 3366; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 3367; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 3368; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 3369; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 3370; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 3371; CHECK-NEXT: [[TMP43:%.*]] = call fast float 
@llvm.fabs.f32(float [[TMP35]]) 3372; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 3373; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 3374; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 3375; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 3376; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 3377; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 3378; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 3379; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 3380; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1 3381; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2 3382; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3383; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 3384; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 3385; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 3386; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 3387; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 3388; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 3389; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 3390; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 3391; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 3392; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 3393; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 3394; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 3395; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 3396; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3397; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 3398; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 3399; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 
[[TMP59]], i32 0 3400; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 3401; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 3402; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 3403; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 3404; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 3405; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 3406; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2 3407; CHECK-NEXT: store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]] 3408; CHECK-NEXT: ret void 3409; 3410; GCN-LABEL: srem_v3i16: 3411; GCN: ; %bb.0: 3412; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3413; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3414; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3415; GCN-NEXT: s_mov_b32 s7, 0xf000 3416; GCN-NEXT: s_waitcnt lgkmcnt(0) 3417; GCN-NEXT: s_sext_i32_i16 s8, s2 3418; GCN-NEXT: s_sext_i32_i16 s6, s0 3419; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 3420; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 3421; GCN-NEXT: s_xor_b32 s6, s8, s6 3422; GCN-NEXT: s_ashr_i32 s6, s6, 30 3423; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 3424; GCN-NEXT: s_or_b32 s6, s6, 1 3425; GCN-NEXT: v_mov_b32_e32 v3, s6 3426; GCN-NEXT: s_mov_b32 s6, -1 3427; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 3428; GCN-NEXT: v_trunc_f32_e32 v2, v2 3429; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 3430; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 3431; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3432; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3433; GCN-NEXT: v_mov_b32_e32 v1, s2 3434; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3435; GCN-NEXT: v_mov_b32_e32 v2, s0 3436; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 16 3437; GCN-NEXT: v_bfe_i32 v3, v2, 0, 16 3438; GCN-NEXT: v_cvt_f32_i32_e32 v4, v3 3439; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 16 3440; GCN-NEXT: v_bfe_i32 v5, v1, 0, 16 3441; GCN-NEXT: v_cvt_f32_i32_e32 v6, v5 3442; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v4 3443; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 3444; GCN-NEXT: v_xor_b32_e32 v3, v5, v3 
3445; GCN-NEXT: s_sext_i32_i16 s0, s1 3446; GCN-NEXT: v_mul_f32_e32 v5, v6, v7 3447; GCN-NEXT: v_trunc_f32_e32 v5, v5 3448; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 3449; GCN-NEXT: v_mad_f32 v6, -v5, v4, v6 3450; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3451; GCN-NEXT: v_ashrrev_i32_e32 v3, 30, v3 3452; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| 3453; GCN-NEXT: v_cvt_f32_i32_e32 v4, s0 3454; GCN-NEXT: v_or_b32_e32 v3, 1, v3 3455; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 3456; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 3457; GCN-NEXT: s_sext_i32_i16 s2, s3 3458; GCN-NEXT: v_mul_lo_u32 v2, v3, v2 3459; GCN-NEXT: v_cvt_f32_i32_e32 v3, s2 3460; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v4 3461; GCN-NEXT: s_xor_b32 s0, s2, s0 3462; GCN-NEXT: s_ashr_i32 s0, s0, 30 3463; GCN-NEXT: s_or_b32 s0, s0, 1 3464; GCN-NEXT: v_mul_f32_e32 v5, v3, v5 3465; GCN-NEXT: v_trunc_f32_e32 v5, v5 3466; GCN-NEXT: v_mad_f32 v3, -v5, v4, v3 3467; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3468; GCN-NEXT: v_mov_b32_e32 v6, s0 3469; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| 3470; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 3471; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 3472; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 3473; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 3474; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3475; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 3476; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 3477; GCN-NEXT: v_or_b32_e32 v0, v0, v1 3478; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 3479; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3480; GCN-NEXT: s_endpgm 3481 %r = srem <3 x i16> %x, %y 3482 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 3483 ret void 3484} 3485 3486define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3487; CHECK-LABEL: @udiv_v3i15( 3488; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3489; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3490; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 3491; 
CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 3492; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3493; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3494; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3495; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3496; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3497; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3498; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3499; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3500; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3501; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3502; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3503; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3504; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3505; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 32767 3506; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15 3507; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0 3508; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1 3509; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3510; CHECK-NEXT: [[TMP23:%.*]] = zext i15 [[TMP21]] to i32 3511; CHECK-NEXT: [[TMP24:%.*]] = zext i15 [[TMP22]] to i32 3512; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 3513; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 3514; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 3515; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 3516; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 3517; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 3518; CHECK-NEXT: [[TMP31:%.*]] = call fast float 
@llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 3519; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 3520; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 3521; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 3522; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 3523; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 3524; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 3525; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 32767 3526; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15 3527; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1 3528; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2 3529; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2 3530; CHECK-NEXT: [[TMP43:%.*]] = zext i15 [[TMP41]] to i32 3531; CHECK-NEXT: [[TMP44:%.*]] = zext i15 [[TMP42]] to i32 3532; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 3533; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 3534; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 3535; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 3536; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 3537; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 3538; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 3539; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 3540; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 3541; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 3542; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 3543; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 3544; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 3545; CHECK-NEXT: 
[[TMP58:%.*]] = and i32 [[TMP57]], 32767 3546; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15 3547; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2 3548; CHECK-NEXT: store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]] 3549; CHECK-NEXT: ret void 3550; 3551; GCN-LABEL: udiv_v3i15: 3552; GCN: ; %bb.0: 3553; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3554; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3555; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3556; GCN-NEXT: s_mov_b32 s7, 0xf000 3557; GCN-NEXT: s_mov_b32 s6, -1 3558; GCN-NEXT: s_waitcnt lgkmcnt(0) 3559; GCN-NEXT: v_mov_b32_e32 v0, s2 3560; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3561; GCN-NEXT: s_movk_i32 s3, 0x7fff 3562; GCN-NEXT: s_and_b32 s9, s0, s3 3563; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 3564; GCN-NEXT: v_mov_b32_e32 v2, s0 3565; GCN-NEXT: s_and_b32 s8, s2, s3 3566; GCN-NEXT: s_bfe_u32 s0, s0, 0xf000f 3567; GCN-NEXT: v_cvt_f32_u32_e32 v5, s0 3568; GCN-NEXT: v_cvt_f32_u32_e32 v3, s8 3569; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 3570; GCN-NEXT: s_bfe_u32 s2, s2, 0xf000f 3571; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 30 3572; GCN-NEXT: v_cvt_f32_u32_e32 v6, s2 3573; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3574; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v5 3575; GCN-NEXT: v_and_b32_e32 v2, s3, v2 3576; GCN-NEXT: v_trunc_f32_e32 v4, v4 3577; GCN-NEXT: v_mad_f32 v3, -v4, v1, v3 3578; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 3579; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 3580; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 3581; GCN-NEXT: v_mul_f32_e32 v1, v6, v7 3582; GCN-NEXT: v_and_b32_e32 v0, s3, v0 3583; GCN-NEXT: v_trunc_f32_e32 v1, v1 3584; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 3585; GCN-NEXT: v_mad_f32 v4, -v1, v5, v6 3586; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 3587; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 3588; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v2 3589; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 3590; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 3591; GCN-NEXT: v_mul_f32_e32 v1, v0, v6 
3592; GCN-NEXT: v_trunc_f32_e32 v1, v1 3593; GCN-NEXT: v_cvt_u32_f32_e32 v5, v1 3594; GCN-NEXT: v_mad_f32 v0, -v1, v2, v0 3595; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 3596; GCN-NEXT: v_and_b32_e32 v2, s3, v3 3597; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc 3598; GCN-NEXT: v_and_b32_e32 v3, s3, v4 3599; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3600; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3601; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3602; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3603; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 3604; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3605; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 3606; GCN-NEXT: s_endpgm 3607 %r = udiv <3 x i15> %x, %y 3608 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3609 ret void 3610} 3611 3612define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3613; CHECK-LABEL: @urem_v3i15( 3614; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3615; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3616; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 3617; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 3618; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3619; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3620; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3621; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3622; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3623; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3624; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3625; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3626; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3627; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3628; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float 
[[TMP13]], [[TMP14]] 3629; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3630; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3631; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 3632; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 3633; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 32767 3634; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15 3635; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0 3636; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1 3637; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3638; CHECK-NEXT: [[TMP25:%.*]] = zext i15 [[TMP23]] to i32 3639; CHECK-NEXT: [[TMP26:%.*]] = zext i15 [[TMP24]] to i32 3640; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 3641; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 3642; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 3643; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 3644; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 3645; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 3646; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 3647; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 3648; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3649; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 3650; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 3651; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 3652; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 3653; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 3654; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 3655; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 32767 3656; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15 3657; 
CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1 3658; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2 3659; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2 3660; CHECK-NEXT: [[TMP47:%.*]] = zext i15 [[TMP45]] to i32 3661; CHECK-NEXT: [[TMP48:%.*]] = zext i15 [[TMP46]] to i32 3662; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 3663; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 3664; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 3665; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 3666; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 3667; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 3668; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 3669; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 3670; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 3671; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 3672; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 3673; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 3674; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 3675; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 3676; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 3677; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 32767 3678; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15 3679; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2 3680; CHECK-NEXT: store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]] 3681; CHECK-NEXT: ret void 3682; 3683; GCN-LABEL: urem_v3i15: 3684; GCN: ; %bb.0: 3685; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3686; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3687; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3688; 
GCN-NEXT: s_mov_b32 s7, 0xf000 3689; GCN-NEXT: s_mov_b32 s6, -1 3690; GCN-NEXT: s_waitcnt lgkmcnt(0) 3691; GCN-NEXT: v_mov_b32_e32 v0, s2 3692; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3693; GCN-NEXT: s_movk_i32 s3, 0x7fff 3694; GCN-NEXT: s_and_b32 s10, s0, s3 3695; GCN-NEXT: v_cvt_f32_u32_e32 v1, s10 3696; GCN-NEXT: s_and_b32 s9, s2, s3 3697; GCN-NEXT: v_cvt_f32_u32_e32 v3, s9 3698; GCN-NEXT: v_mov_b32_e32 v2, s0 3699; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 3700; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 30 3701; GCN-NEXT: s_bfe_u32 s1, s0, 0xf000f 3702; GCN-NEXT: v_cvt_f32_u32_e32 v5, s1 3703; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3704; GCN-NEXT: v_trunc_f32_e32 v4, v4 3705; GCN-NEXT: v_mad_f32 v3, -v4, v1, v3 3706; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 3707; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 3708; GCN-NEXT: s_bfe_u32 s10, s2, 0xf000f 3709; GCN-NEXT: v_cvt_f32_u32_e32 v3, s10 3710; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc 3711; GCN-NEXT: v_mul_lo_u32 v1, v1, s0 3712; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v5 3713; GCN-NEXT: v_and_b32_e32 v2, s3, v2 3714; GCN-NEXT: v_and_b32_e32 v0, s3, v0 3715; GCN-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 3716; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 3717; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 3718; GCN-NEXT: v_cvt_f32_u32_e32 v7, v0 3719; GCN-NEXT: v_trunc_f32_e32 v1, v1 3720; GCN-NEXT: v_mad_f32 v3, -v1, v5, v3 3721; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v4 3722; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 3723; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 3724; GCN-NEXT: s_lshr_b32 s0, s0, 15 3725; GCN-NEXT: v_mul_f32_e32 v3, v7, v8 3726; GCN-NEXT: v_trunc_f32_e32 v3, v3 3727; GCN-NEXT: v_cvt_u32_f32_e32 v5, v3 3728; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3729; GCN-NEXT: v_mad_f32 v3, -v3, v4, v7 3730; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3731; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 3732; GCN-NEXT: v_mul_lo_u32 v1, v1, s0 3733; GCN-NEXT: v_mul_lo_u32 v2, v3, v2 3734; GCN-NEXT: s_lshr_b32 s8, s2, 15 3735; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 
3736; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 3737; GCN-NEXT: v_and_b32_e32 v3, s3, v3 3738; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3739; GCN-NEXT: v_and_b32_e32 v2, s3, v6 3740; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3741; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3742; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3743; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 3744; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3745; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 3746; GCN-NEXT: s_endpgm 3747 %r = urem <3 x i15> %x, %y 3748 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3749 ret void 3750} 3751 3752define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3753; CHECK-LABEL: @sdiv_v3i15( 3754; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3755; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3756; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 3757; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 3758; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3759; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3760; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3761; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3762; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3763; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3764; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3765; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3766; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3767; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3768; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3769; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3770; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3771; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float 
[[TMP16]], [[TMP17]] 3772; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3773; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3774; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 17 3775; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 17 3776; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15 3777; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0 3778; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1 3779; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3780; CHECK-NEXT: [[TMP27:%.*]] = sext i15 [[TMP25]] to i32 3781; CHECK-NEXT: [[TMP28:%.*]] = sext i15 [[TMP26]] to i32 3782; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 3783; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 3784; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 3785; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 3786; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 3787; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 3788; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 3789; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 3790; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 3791; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 3792; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 3793; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 3794; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3795; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 3796; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 3797; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 3798; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 17 3799; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 17 3800; CHECK-NEXT: 
[[TMP47:%.*]] = trunc i32 [[TMP46]] to i15 3801; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1 3802; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2 3803; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2 3804; CHECK-NEXT: [[TMP51:%.*]] = sext i15 [[TMP49]] to i32 3805; CHECK-NEXT: [[TMP52:%.*]] = sext i15 [[TMP50]] to i32 3806; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 3807; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 3808; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 3809; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 3810; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 3811; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 3812; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 3813; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 3814; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 3815; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 3816; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 3817; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 3818; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 3819; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 3820; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 3821; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 3822; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 17 3823; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 17 3824; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15 3825; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2 3826; CHECK-NEXT: store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]] 3827; CHECK-NEXT: ret void 3828; 3829; GCN-LABEL: sdiv_v3i15: 3830; GCN: ; %bb.0: 
3831; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3832; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3833; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3834; GCN-NEXT: s_mov_b32 s7, 0xf000 3835; GCN-NEXT: s_mov_b32 s6, -1 3836; GCN-NEXT: s_waitcnt lgkmcnt(0) 3837; GCN-NEXT: v_mov_b32_e32 v0, s2 3838; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3839; GCN-NEXT: s_bfe_i32 s3, s0, 0xf0000 3840; GCN-NEXT: v_cvt_f32_i32_e32 v2, s3 3841; GCN-NEXT: v_mov_b32_e32 v1, s0 3842; GCN-NEXT: v_alignbit_b32 v1, s1, v1, 30 3843; GCN-NEXT: s_bfe_i32 s1, s2, 0xf0000 3844; GCN-NEXT: v_cvt_f32_i32_e32 v3, s1 3845; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 3846; GCN-NEXT: s_xor_b32 s1, s1, s3 3847; GCN-NEXT: s_bfe_i32 s0, s0, 0xf000f 3848; GCN-NEXT: s_ashr_i32 s1, s1, 30 3849; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3850; GCN-NEXT: v_trunc_f32_e32 v4, v4 3851; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 3852; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 3853; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 3854; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 3855; GCN-NEXT: s_or_b32 s1, s1, 1 3856; GCN-NEXT: v_mov_b32_e32 v5, s1 3857; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 3858; GCN-NEXT: s_bfe_i32 s1, s2, 0xf000f 3859; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 3860; GCN-NEXT: v_cvt_f32_i32_e32 v4, s1 3861; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 3862; GCN-NEXT: s_xor_b32 s0, s1, s0 3863; GCN-NEXT: v_bfe_i32 v1, v1, 0, 15 3864; GCN-NEXT: s_ashr_i32 s0, s0, 30 3865; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 3866; GCN-NEXT: v_trunc_f32_e32 v5, v5 3867; GCN-NEXT: v_mad_f32 v4, -v5, v3, v4 3868; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 3869; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3870; GCN-NEXT: v_cvt_f32_i32_e32 v4, v1 3871; GCN-NEXT: s_or_b32 s0, s0, 1 3872; GCN-NEXT: v_mov_b32_e32 v6, s0 3873; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 3874; GCN-NEXT: v_bfe_i32 v0, v0, 0, 15 3875; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 3876; GCN-NEXT: v_cvt_f32_i32_e32 v5, v0 3877; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 3878; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 
3879; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 3880; GCN-NEXT: v_or_b32_e32 v0, 1, v0 3881; GCN-NEXT: v_mul_f32_e32 v1, v5, v6 3882; GCN-NEXT: v_trunc_f32_e32 v1, v1 3883; GCN-NEXT: v_mad_f32 v5, -v1, v4, v5 3884; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 3885; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 3886; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 3887; GCN-NEXT: s_movk_i32 s0, 0x7fff 3888; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 3889; GCN-NEXT: v_and_b32_e32 v3, s0, v3 3890; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3891; GCN-NEXT: v_and_b32_e32 v2, s0, v2 3892; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3893; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3894; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3895; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 3896; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3897; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 3898; GCN-NEXT: s_endpgm 3899 %r = sdiv <3 x i15> %x, %y 3900 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3901 ret void 3902} 3903 3904define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3905; CHECK-LABEL: @srem_v3i15( 3906; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3907; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3908; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 3909; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 3910; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3911; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3912; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3913; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3914; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3915; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3916; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3917; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3918; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3919; CHECK-NEXT: 
[[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3920; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3921; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3922; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3923; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3924; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3925; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3926; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 3927; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 3928; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 17 3929; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 17 3930; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15 3931; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0 3932; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1 3933; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3934; CHECK-NEXT: [[TMP29:%.*]] = sext i15 [[TMP27]] to i32 3935; CHECK-NEXT: [[TMP30:%.*]] = sext i15 [[TMP28]] to i32 3936; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 3937; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 3938; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 3939; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 3940; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 3941; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3942; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 3943; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 3944; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 3945; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 3946; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 3947; 
CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 3948; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 3949; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 3950; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 3951; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 3952; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 3953; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 3954; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 17 3955; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 17 3956; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15 3957; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1 3958; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2 3959; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2 3960; CHECK-NEXT: [[TMP55:%.*]] = sext i15 [[TMP53]] to i32 3961; CHECK-NEXT: [[TMP56:%.*]] = sext i15 [[TMP54]] to i32 3962; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 3963; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 3964; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 3965; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 3966; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 3967; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 3968; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 3969; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 3970; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 3971; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 3972; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 3973; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3974; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 3975; 
CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 3976; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 3977; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 3978; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 3979; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 3980; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 17 3981; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 17 3982; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15 3983; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2 3984; CHECK-NEXT: store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]] 3985; CHECK-NEXT: ret void 3986; 3987; GCN-LABEL: srem_v3i15: 3988; GCN: ; %bb.0: 3989; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3990; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3991; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3992; GCN-NEXT: s_mov_b32 s7, 0xf000 3993; GCN-NEXT: s_mov_b32 s6, -1 3994; GCN-NEXT: s_waitcnt lgkmcnt(0) 3995; GCN-NEXT: v_mov_b32_e32 v0, s2 3996; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3997; GCN-NEXT: s_movk_i32 s3, 0x7fff 3998; GCN-NEXT: s_and_b32 s11, s0, s3 3999; GCN-NEXT: s_bfe_i32 s11, s11, 0xf0000 4000; GCN-NEXT: v_cvt_f32_i32_e32 v2, s11 4001; GCN-NEXT: s_and_b32 s9, s2, s3 4002; GCN-NEXT: s_bfe_i32 s9, s9, 0xf0000 4003; GCN-NEXT: v_cvt_f32_i32_e32 v3, s9 4004; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 4005; GCN-NEXT: s_xor_b32 s9, s9, s11 4006; GCN-NEXT: s_ashr_i32 s9, s9, 30 4007; GCN-NEXT: s_or_b32 s9, s9, 1 4008; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 4009; GCN-NEXT: v_trunc_f32_e32 v4, v4 4010; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 4011; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 4012; GCN-NEXT: v_mov_b32_e32 v5, s9 4013; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 4014; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 4015; GCN-NEXT: v_mov_b32_e32 v1, s0 4016; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 4017; GCN-NEXT: s_bfe_u32 s12, s0, 0xf000f 4018; GCN-NEXT: v_alignbit_b32 v1, 
s1, v1, 30 4019; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 4020; GCN-NEXT: s_lshr_b32 s1, s0, 15 4021; GCN-NEXT: s_bfe_i32 s0, s12, 0xf0000 4022; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 4023; GCN-NEXT: s_bfe_u32 s10, s2, 0xf000f 4024; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 4025; GCN-NEXT: s_lshr_b32 s8, s2, 15 4026; GCN-NEXT: s_bfe_i32 s2, s10, 0xf0000 4027; GCN-NEXT: v_cvt_f32_i32_e32 v4, s2 4028; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 4029; GCN-NEXT: s_xor_b32 s0, s2, s0 4030; GCN-NEXT: s_ashr_i32 s0, s0, 30 4031; GCN-NEXT: s_or_b32 s0, s0, 1 4032; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 4033; GCN-NEXT: v_trunc_f32_e32 v5, v5 4034; GCN-NEXT: v_mad_f32 v4, -v5, v3, v4 4035; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 4036; GCN-NEXT: v_and_b32_e32 v1, s3, v1 4037; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 4038; GCN-NEXT: v_mov_b32_e32 v6, s0 4039; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 4040; GCN-NEXT: v_bfe_i32 v4, v1, 0, 15 4041; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 4042; GCN-NEXT: v_cvt_f32_i32_e32 v5, v4 4043; GCN-NEXT: v_and_b32_e32 v0, s3, v0 4044; GCN-NEXT: v_bfe_i32 v6, v0, 0, 15 4045; GCN-NEXT: v_cvt_f32_i32_e32 v7, v6 4046; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v5 4047; GCN-NEXT: v_xor_b32_e32 v4, v6, v4 4048; GCN-NEXT: v_ashrrev_i32_e32 v4, 30, v4 4049; GCN-NEXT: v_or_b32_e32 v4, 1, v4 4050; GCN-NEXT: v_mul_f32_e32 v6, v7, v8 4051; GCN-NEXT: v_trunc_f32_e32 v6, v6 4052; GCN-NEXT: v_mad_f32 v7, -v6, v5, v7 4053; GCN-NEXT: v_cvt_i32_f32_e32 v6, v6 4054; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v5| 4055; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc 4056; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 4057; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 4058; GCN-NEXT: v_mul_lo_u32 v1, v4, v1 4059; GCN-NEXT: v_and_b32_e32 v2, s3, v2 4060; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 4061; GCN-NEXT: v_and_b32_e32 v3, s3, v3 4062; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 4063; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 4064; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 4065; GCN-NEXT: v_or_b32_e32 v2, v2, v3 4066; 
GCN-NEXT: v_or_b32_e32 v0, v2, v0 4067; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 4068; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4069; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 4070; GCN-NEXT: s_endpgm 4071 %r = srem <3 x i15> %x, %y 4072 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 4073 ret void 4074} 4075 4076define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 4077; CHECK-LABEL: @udiv_i32_oddk_denom( 4078; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 1235195 4079; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] 4080; CHECK-NEXT: ret void 4081; 4082; GCN-LABEL: udiv_i32_oddk_denom: 4083; GCN: ; %bb.0: 4084; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4085; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4086; GCN-NEXT: v_mov_b32_e32 v0, 0xb2a50881 4087; GCN-NEXT: s_mov_b32 s7, 0xf000 4088; GCN-NEXT: s_mov_b32 s6, -1 4089; GCN-NEXT: s_waitcnt lgkmcnt(0) 4090; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 4091; GCN-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 4092; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 4093; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 4094; GCN-NEXT: v_lshrrev_b32_e32 v0, 20, v0 4095; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4096; GCN-NEXT: s_endpgm 4097 %r = udiv i32 %x, 1235195 4098 store i32 %r, i32 addrspace(1)* %out 4099 ret void 4100} 4101 4102define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 4103; CHECK-LABEL: @udiv_i32_pow2k_denom( 4104; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 4096 4105; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] 4106; CHECK-NEXT: ret void 4107; 4108; GCN-LABEL: udiv_i32_pow2k_denom: 4109; GCN: ; %bb.0: 4110; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4111; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4112; GCN-NEXT: s_mov_b32 s7, 0xf000 4113; GCN-NEXT: s_mov_b32 s6, -1 4114; GCN-NEXT: s_waitcnt lgkmcnt(0) 4115; GCN-NEXT: s_lshr_b32 s0, s0, 12 4116; GCN-NEXT: v_mov_b32_e32 v0, s0 4117; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4118; 
GCN-NEXT: s_endpgm 4119 %r = udiv i32 %x, 4096 4120 store i32 %r, i32 addrspace(1)* %out 4121 ret void 4122} 4123 4124define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 4125; CHECK-LABEL: @udiv_i32_pow2_shl_denom( 4126; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 4127; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]] 4128; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] 4129; CHECK-NEXT: ret void 4130; 4131; GCN-LABEL: udiv_i32_pow2_shl_denom: 4132; GCN: ; %bb.0: 4133; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4134; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4135; GCN-NEXT: s_mov_b32 s7, 0xf000 4136; GCN-NEXT: s_mov_b32 s6, -1 4137; GCN-NEXT: s_waitcnt lgkmcnt(0) 4138; GCN-NEXT: s_add_i32 s1, s1, 12 4139; GCN-NEXT: s_lshr_b32 s0, s0, s1 4140; GCN-NEXT: v_mov_b32_e32 v0, s0 4141; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4142; GCN-NEXT: s_endpgm 4143 %shl.y = shl i32 4096, %y 4144 %r = udiv i32 %x, %shl.y 4145 store i32 %r, i32 addrspace(1)* %out 4146 ret void 4147} 4148 4149define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4150; CHECK-LABEL: @udiv_v2i32_pow2k_denom( 4151; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4152; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 4153; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4154; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4155; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096 4156; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4157; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]] 4158; CHECK-NEXT: ret void 4159; 4160; GCN-LABEL: udiv_v2i32_pow2k_denom: 4161; GCN: ; %bb.0: 4162; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4163; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4164; GCN-NEXT: s_mov_b32 s7, 0xf000 4165; GCN-NEXT: s_mov_b32 s6, -1 4166; GCN-NEXT: 
s_waitcnt lgkmcnt(0) 4167; GCN-NEXT: s_lshr_b32 s0, s0, 12 4168; GCN-NEXT: s_lshr_b32 s1, s1, 12 4169; GCN-NEXT: v_mov_b32_e32 v0, s0 4170; GCN-NEXT: v_mov_b32_e32 v1, s1 4171; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4172; GCN-NEXT: s_endpgm 4173 %r = udiv <2 x i32> %x, <i32 4096, i32 4096> 4174 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4175 ret void 4176} 4177 4178define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4179; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom( 4180; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4181; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 4182; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4183; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4184; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095 4185; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4186; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]] 4187; CHECK-NEXT: ret void 4188; 4189; GCN-LABEL: udiv_v2i32_mixed_pow2k_denom: 4190; GCN: ; %bb.0: 4191; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4192; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4193; GCN-NEXT: v_mov_b32_e32 v0, 0x100101 4194; GCN-NEXT: s_mov_b32 s7, 0xf000 4195; GCN-NEXT: s_mov_b32 s6, -1 4196; GCN-NEXT: s_waitcnt lgkmcnt(0) 4197; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 4198; GCN-NEXT: s_lshr_b32 s0, s0, 12 4199; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v0 4200; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 4201; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 4202; GCN-NEXT: v_lshrrev_b32_e32 v1, 11, v0 4203; GCN-NEXT: v_mov_b32_e32 v0, s0 4204; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4205; GCN-NEXT: s_endpgm 4206 %r = udiv <2 x i32> %x, <i32 4096, i32 4095> 4207 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4208 ret void 4209} 4210 4211define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* 
%out, <2 x i32> %x, <2 x i32> %y) { 4212; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom( 4213; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 4214; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4215; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 4216; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 4217; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 4218; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41F0000000000000 4219; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 4220; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 4221; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP2]] to i64 4222; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 4223; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 4224; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 4225; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 4226; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP10]] 4227; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0 4228; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP10]] 4229; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 4230; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP6]] to i64 4231; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 4232; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 4233; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 4234; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 4235; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP6]], [[TMP21]] 4236; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP6]], [[TMP21]] 4237; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP14]], i32 [[TMP22]], i32 [[TMP23]] 4238; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64 4239; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP1]] to i64 4240; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP25]], [[TMP26]] 4241; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 4242; CHECK-NEXT: [[TMP29:%.*]] = 
lshr i64 [[TMP27]], 32 4243; CHECK-NEXT: [[TMP30:%.*]] = trunc i64 [[TMP29]] to i32 4244; CHECK-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], [[TMP2]] 4245; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP1]], [[TMP31]] 4246; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP2]] 4247; CHECK-NEXT: [[TMP34:%.*]] = icmp uge i32 [[TMP1]], [[TMP31]] 4248; CHECK-NEXT: [[TMP35:%.*]] = and i1 [[TMP33]], [[TMP34]] 4249; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP30]], 1 4250; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP30]], 1 4251; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP35]], i32 [[TMP36]], i32 [[TMP30]] 4252; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP34]], i32 [[TMP38]], i32 [[TMP37]] 4253; CHECK-NEXT: [[TMP40:%.*]] = insertelement <2 x i32> undef, i32 [[TMP39]], i64 0 4254; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[X]], i64 1 4255; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 4256; CHECK-NEXT: [[TMP43:%.*]] = uitofp i32 [[TMP42]] to float 4257; CHECK-NEXT: [[TMP44:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP43]]) 4258; CHECK-NEXT: [[TMP45:%.*]] = fmul fast float [[TMP44]], 0x41F0000000000000 4259; CHECK-NEXT: [[TMP46:%.*]] = fptoui float [[TMP45]] to i32 4260; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP46]] to i64 4261; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP42]] to i64 4262; CHECK-NEXT: [[TMP49:%.*]] = mul i64 [[TMP47]], [[TMP48]] 4263; CHECK-NEXT: [[TMP50:%.*]] = trunc i64 [[TMP49]] to i32 4264; CHECK-NEXT: [[TMP51:%.*]] = lshr i64 [[TMP49]], 32 4265; CHECK-NEXT: [[TMP52:%.*]] = trunc i64 [[TMP51]] to i32 4266; CHECK-NEXT: [[TMP53:%.*]] = sub i32 0, [[TMP50]] 4267; CHECK-NEXT: [[TMP54:%.*]] = icmp eq i32 [[TMP52]], 0 4268; CHECK-NEXT: [[TMP55:%.*]] = select i1 [[TMP54]], i32 [[TMP53]], i32 [[TMP50]] 4269; CHECK-NEXT: [[TMP56:%.*]] = zext i32 [[TMP55]] to i64 4270; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP46]] to i64 4271; CHECK-NEXT: [[TMP58:%.*]] = mul i64 [[TMP56]], [[TMP57]] 4272; CHECK-NEXT: [[TMP59:%.*]] = trunc i64 
[[TMP58]] to i32 4273; CHECK-NEXT: [[TMP60:%.*]] = lshr i64 [[TMP58]], 32 4274; CHECK-NEXT: [[TMP61:%.*]] = trunc i64 [[TMP60]] to i32 4275; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP46]], [[TMP61]] 4276; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP46]], [[TMP61]] 4277; CHECK-NEXT: [[TMP64:%.*]] = select i1 [[TMP54]], i32 [[TMP62]], i32 [[TMP63]] 4278; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP64]] to i64 4279; CHECK-NEXT: [[TMP66:%.*]] = zext i32 [[TMP41]] to i64 4280; CHECK-NEXT: [[TMP67:%.*]] = mul i64 [[TMP65]], [[TMP66]] 4281; CHECK-NEXT: [[TMP68:%.*]] = trunc i64 [[TMP67]] to i32 4282; CHECK-NEXT: [[TMP69:%.*]] = lshr i64 [[TMP67]], 32 4283; CHECK-NEXT: [[TMP70:%.*]] = trunc i64 [[TMP69]] to i32 4284; CHECK-NEXT: [[TMP71:%.*]] = mul i32 [[TMP70]], [[TMP42]] 4285; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP41]], [[TMP71]] 4286; CHECK-NEXT: [[TMP73:%.*]] = icmp uge i32 [[TMP72]], [[TMP42]] 4287; CHECK-NEXT: [[TMP74:%.*]] = icmp uge i32 [[TMP41]], [[TMP71]] 4288; CHECK-NEXT: [[TMP75:%.*]] = and i1 [[TMP73]], [[TMP74]] 4289; CHECK-NEXT: [[TMP76:%.*]] = add i32 [[TMP70]], 1 4290; CHECK-NEXT: [[TMP77:%.*]] = sub i32 [[TMP70]], 1 4291; CHECK-NEXT: [[TMP78:%.*]] = select i1 [[TMP75]], i32 [[TMP76]], i32 [[TMP70]] 4292; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP74]], i32 [[TMP78]], i32 [[TMP77]] 4293; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x i32> [[TMP40]], i32 [[TMP79]], i64 1 4294; CHECK-NEXT: store <2 x i32> [[TMP80]], <2 x i32> addrspace(1)* [[OUT:%.*]] 4295; CHECK-NEXT: ret void 4296; 4297; GCN-LABEL: udiv_v2i32_pow2_shl_denom: 4298; GCN: ; %bb.0: 4299; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4300; GCN-NEXT: s_movk_i32 s4, 0x1000 4301; GCN-NEXT: s_mov_b32 s7, 0xf000 4302; GCN-NEXT: s_mov_b32 s6, -1 4303; GCN-NEXT: s_waitcnt lgkmcnt(0) 4304; GCN-NEXT: s_lshl_b32 s2, s4, s2 4305; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 4306; GCN-NEXT: s_lshl_b32 s10, s4, s3 4307; GCN-NEXT: s_mov_b32 s3, 0x4f800000 4308; GCN-NEXT: v_cvt_f32_u32_e32 v1, s10 4309; GCN-NEXT: 
v_rcp_iflag_f32_e32 v0, v0 4310; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4311; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 4312; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 4313; GCN-NEXT: v_mul_f32_e32 v0, s3, v0 4314; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4315; GCN-NEXT: v_mul_f32_e32 v1, s3, v1 4316; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4317; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 4318; GCN-NEXT: v_mul_hi_u32 v3, v0, s2 4319; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 4320; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 4321; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 4322; GCN-NEXT: v_mul_hi_u32 v2, v2, v0 4323; GCN-NEXT: v_mul_lo_u32 v3, v1, s10 4324; GCN-NEXT: v_add_i32_e32 v4, vcc, v2, v0 4325; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 4326; GCN-NEXT: v_mul_hi_u32 v2, v1, s10 4327; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 4328; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 4329; GCN-NEXT: s_waitcnt lgkmcnt(0) 4330; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 4331; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 4332; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] 4333; GCN-NEXT: v_mul_hi_u32 v2, v2, v1 4334; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 4335; GCN-NEXT: v_add_i32_e32 v4, vcc, v2, v1 4336; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v2, v1 4337; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 4338; GCN-NEXT: v_mul_hi_u32 v1, v1, s9 4339; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v5 4340; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v3 4341; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 4342; GCN-NEXT: v_mul_lo_u32 v4, v1, s10 4343; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s8, v5 4344; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 4345; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] 4346; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4347; GCN-NEXT: v_sub_i32_e32 v2, vcc, s9, v4 4348; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] 4349; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 4350; GCN-NEXT: v_add_i32_e32 v2, vcc, -1, v1 4351; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s9, v4 4352; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 4353; GCN-NEXT: 
s_and_b64 vcc, s[0:1], s[2:3] 4354; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 4355; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3] 4356; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4357; GCN-NEXT: s_endpgm 4358 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4359 %r = udiv <2 x i32> %x, %shl.y 4360 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4361 ret void 4362} 4363 4364define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 4365; CHECK-LABEL: @urem_i32_oddk_denom( 4366; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 1235195 4367; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] 4368; CHECK-NEXT: ret void 4369; 4370; GCN-LABEL: urem_i32_oddk_denom: 4371; GCN: ; %bb.0: 4372; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4373; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4374; GCN-NEXT: v_mov_b32_e32 v0, 0xb2a50881 4375; GCN-NEXT: s_mov_b32 s7, 0xf000 4376; GCN-NEXT: s_mov_b32 s6, -1 4377; GCN-NEXT: s_waitcnt lgkmcnt(0) 4378; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 4379; GCN-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 4380; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 4381; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 4382; GCN-NEXT: v_lshrrev_b32_e32 v0, 20, v0 4383; GCN-NEXT: v_mul_u32_u24_e32 v0, 0x12d8fb, v0 4384; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4385; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4386; GCN-NEXT: s_endpgm 4387 %r = urem i32 %x, 1235195 4388 store i32 %r, i32 addrspace(1)* %out 4389 ret void 4390} 4391 4392define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 4393; CHECK-LABEL: @urem_i32_pow2k_denom( 4394; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 4096 4395; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] 4396; CHECK-NEXT: ret void 4397; 4398; GCN-LABEL: urem_i32_pow2k_denom: 4399; GCN: ; %bb.0: 4400; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4401; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4402; GCN-NEXT: s_mov_b32 s7, 0xf000 4403; GCN-NEXT: s_mov_b32 s6, -1 4404; GCN-NEXT: 
s_waitcnt lgkmcnt(0) 4405; GCN-NEXT: s_and_b32 s0, s0, 0xfff 4406; GCN-NEXT: v_mov_b32_e32 v0, s0 4407; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4408; GCN-NEXT: s_endpgm 4409 %r = urem i32 %x, 4096 4410 store i32 %r, i32 addrspace(1)* %out 4411 ret void 4412} 4413 4414define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 4415; CHECK-LABEL: @urem_i32_pow2_shl_denom( 4416; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 4417; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]] 4418; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] 4419; CHECK-NEXT: ret void 4420; 4421; GCN-LABEL: urem_i32_pow2_shl_denom: 4422; GCN: ; %bb.0: 4423; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4424; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4425; GCN-NEXT: s_mov_b32 s7, 0xf000 4426; GCN-NEXT: s_mov_b32 s6, -1 4427; GCN-NEXT: s_waitcnt lgkmcnt(0) 4428; GCN-NEXT: s_lshl_b32 s1, 0x1000, s1 4429; GCN-NEXT: s_add_i32 s1, s1, -1 4430; GCN-NEXT: s_and_b32 s0, s0, s1 4431; GCN-NEXT: v_mov_b32_e32 v0, s0 4432; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4433; GCN-NEXT: s_endpgm 4434 %shl.y = shl i32 4096, %y 4435 %r = urem i32 %x, %shl.y 4436 store i32 %r, i32 addrspace(1)* %out 4437 ret void 4438} 4439 4440define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4441; CHECK-LABEL: @urem_v2i32_pow2k_denom( 4442; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4443; CHECK-NEXT: [[TMP2:%.*]] = urem i32 [[TMP1]], 4096 4444; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4445; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4446; CHECK-NEXT: [[TMP5:%.*]] = urem i32 [[TMP4]], 4096 4447; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4448; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]] 4449; CHECK-NEXT: ret void 4450; 4451; GCN-LABEL: urem_v2i32_pow2k_denom: 4452; 
GCN: ; %bb.0: 4453; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4454; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4455; GCN-NEXT: s_movk_i32 s2, 0xfff 4456; GCN-NEXT: s_mov_b32 s7, 0xf000 4457; GCN-NEXT: s_mov_b32 s6, -1 4458; GCN-NEXT: s_waitcnt lgkmcnt(0) 4459; GCN-NEXT: s_and_b32 s0, s0, s2 4460; GCN-NEXT: s_and_b32 s1, s1, s2 4461; GCN-NEXT: v_mov_b32_e32 v0, s0 4462; GCN-NEXT: v_mov_b32_e32 v1, s1 4463; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4464; GCN-NEXT: s_endpgm 4465 %r = urem <2 x i32> %x, <i32 4096, i32 4096> 4466 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4467 ret void 4468} 4469 4470define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 4471; CHECK-LABEL: @urem_v2i32_pow2_shl_denom( 4472; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 4473; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4474; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 4475; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 4476; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 4477; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41F0000000000000 4478; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 4479; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 4480; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP2]] to i64 4481; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 4482; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 4483; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 4484; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 4485; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP10]] 4486; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0 4487; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP10]] 4488; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 4489; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP6]] to i64 4490; CHECK-NEXT: 
[[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 4491; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 4492; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 4493; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 4494; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP6]], [[TMP21]] 4495; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP6]], [[TMP21]] 4496; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP14]], i32 [[TMP22]], i32 [[TMP23]] 4497; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64 4498; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP1]] to i64 4499; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP25]], [[TMP26]] 4500; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 4501; CHECK-NEXT: [[TMP29:%.*]] = lshr i64 [[TMP27]], 32 4502; CHECK-NEXT: [[TMP30:%.*]] = trunc i64 [[TMP29]] to i32 4503; CHECK-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], [[TMP2]] 4504; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP1]], [[TMP31]] 4505; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP2]] 4506; CHECK-NEXT: [[TMP34:%.*]] = icmp uge i32 [[TMP1]], [[TMP31]] 4507; CHECK-NEXT: [[TMP35:%.*]] = and i1 [[TMP33]], [[TMP34]] 4508; CHECK-NEXT: [[TMP36:%.*]] = sub i32 [[TMP32]], [[TMP2]] 4509; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP2]] 4510; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP35]], i32 [[TMP36]], i32 [[TMP32]] 4511; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP34]], i32 [[TMP38]], i32 [[TMP37]] 4512; CHECK-NEXT: [[TMP40:%.*]] = insertelement <2 x i32> undef, i32 [[TMP39]], i64 0 4513; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[X]], i64 1 4514; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 4515; CHECK-NEXT: [[TMP43:%.*]] = uitofp i32 [[TMP42]] to float 4516; CHECK-NEXT: [[TMP44:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP43]]) 4517; CHECK-NEXT: [[TMP45:%.*]] = fmul fast float [[TMP44]], 0x41F0000000000000 4518; CHECK-NEXT: [[TMP46:%.*]] = fptoui float [[TMP45]] to i32 4519; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP46]] to i64 
4520; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP42]] to i64 4521; CHECK-NEXT: [[TMP49:%.*]] = mul i64 [[TMP47]], [[TMP48]] 4522; CHECK-NEXT: [[TMP50:%.*]] = trunc i64 [[TMP49]] to i32 4523; CHECK-NEXT: [[TMP51:%.*]] = lshr i64 [[TMP49]], 32 4524; CHECK-NEXT: [[TMP52:%.*]] = trunc i64 [[TMP51]] to i32 4525; CHECK-NEXT: [[TMP53:%.*]] = sub i32 0, [[TMP50]] 4526; CHECK-NEXT: [[TMP54:%.*]] = icmp eq i32 [[TMP52]], 0 4527; CHECK-NEXT: [[TMP55:%.*]] = select i1 [[TMP54]], i32 [[TMP53]], i32 [[TMP50]] 4528; CHECK-NEXT: [[TMP56:%.*]] = zext i32 [[TMP55]] to i64 4529; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP46]] to i64 4530; CHECK-NEXT: [[TMP58:%.*]] = mul i64 [[TMP56]], [[TMP57]] 4531; CHECK-NEXT: [[TMP59:%.*]] = trunc i64 [[TMP58]] to i32 4532; CHECK-NEXT: [[TMP60:%.*]] = lshr i64 [[TMP58]], 32 4533; CHECK-NEXT: [[TMP61:%.*]] = trunc i64 [[TMP60]] to i32 4534; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP46]], [[TMP61]] 4535; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP46]], [[TMP61]] 4536; CHECK-NEXT: [[TMP64:%.*]] = select i1 [[TMP54]], i32 [[TMP62]], i32 [[TMP63]] 4537; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP64]] to i64 4538; CHECK-NEXT: [[TMP66:%.*]] = zext i32 [[TMP41]] to i64 4539; CHECK-NEXT: [[TMP67:%.*]] = mul i64 [[TMP65]], [[TMP66]] 4540; CHECK-NEXT: [[TMP68:%.*]] = trunc i64 [[TMP67]] to i32 4541; CHECK-NEXT: [[TMP69:%.*]] = lshr i64 [[TMP67]], 32 4542; CHECK-NEXT: [[TMP70:%.*]] = trunc i64 [[TMP69]] to i32 4543; CHECK-NEXT: [[TMP71:%.*]] = mul i32 [[TMP70]], [[TMP42]] 4544; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP41]], [[TMP71]] 4545; CHECK-NEXT: [[TMP73:%.*]] = icmp uge i32 [[TMP72]], [[TMP42]] 4546; CHECK-NEXT: [[TMP74:%.*]] = icmp uge i32 [[TMP41]], [[TMP71]] 4547; CHECK-NEXT: [[TMP75:%.*]] = and i1 [[TMP73]], [[TMP74]] 4548; CHECK-NEXT: [[TMP76:%.*]] = sub i32 [[TMP72]], [[TMP42]] 4549; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP42]] 4550; CHECK-NEXT: [[TMP78:%.*]] = select i1 [[TMP75]], i32 [[TMP76]], i32 [[TMP72]] 4551; CHECK-NEXT: [[TMP79:%.*]] 
= select i1 [[TMP74]], i32 [[TMP78]], i32 [[TMP77]] 4552; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x i32> [[TMP40]], i32 [[TMP79]], i64 1 4553; CHECK-NEXT: store <2 x i32> [[TMP80]], <2 x i32> addrspace(1)* [[OUT:%.*]] 4554; CHECK-NEXT: ret void 4555; 4556; GCN-LABEL: urem_v2i32_pow2_shl_denom: 4557; GCN: ; %bb.0: 4558; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4559; GCN-NEXT: s_movk_i32 s4, 0x1000 4560; GCN-NEXT: s_mov_b32 s7, 0xf000 4561; GCN-NEXT: s_mov_b32 s6, -1 4562; GCN-NEXT: s_waitcnt lgkmcnt(0) 4563; GCN-NEXT: s_lshl_b32 s10, s4, s2 4564; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 4565; GCN-NEXT: s_mov_b32 s2, 0x4f800000 4566; GCN-NEXT: s_lshl_b32 s11, s4, s3 4567; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 4568; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4569; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4570; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 4571; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 4572; GCN-NEXT: v_mul_f32_e32 v0, s2, v0 4573; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4574; GCN-NEXT: v_mul_f32_e32 v1, s2, v1 4575; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4576; GCN-NEXT: v_mul_lo_u32 v2, v0, s10 4577; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 4578; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 4579; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 4580; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 4581; GCN-NEXT: v_mul_hi_u32 v2, v2, v0 4582; GCN-NEXT: v_mul_lo_u32 v3, v1, s11 4583; GCN-NEXT: v_add_i32_e32 v4, vcc, v2, v0 4584; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 4585; GCN-NEXT: v_mul_hi_u32 v2, v1, s11 4586; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 4587; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 4588; GCN-NEXT: s_waitcnt lgkmcnt(0) 4589; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 4590; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 4591; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] 4592; GCN-NEXT: v_mul_hi_u32 v2, v2, v1 4593; GCN-NEXT: v_mul_lo_u32 v0, v0, s10 4594; GCN-NEXT: v_add_i32_e32 v5, vcc, v2, v1 4595; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v2, v1 4596; GCN-NEXT: v_cndmask_b32_e64 
v1, v1, v5, s[0:1] 4597; GCN-NEXT: v_mul_hi_u32 v1, v1, s9 4598; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v0 4599; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s8, v0 4600; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s10, v3 4601; GCN-NEXT: v_mul_lo_u32 v1, v1, s11 4602; GCN-NEXT: v_add_i32_e32 v4, vcc, s10, v3 4603; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v3 4604; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] 4605; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc 4606; GCN-NEXT: v_sub_i32_e32 v2, vcc, s9, v1 4607; GCN-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] 4608; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s9, v1 4609; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 4610; GCN-NEXT: v_add_i32_e32 v3, vcc, s11, v2 4611; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s11, v2 4612; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 4613; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 4614; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[2:3] 4615; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4616; GCN-NEXT: s_endpgm 4617 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4618 %r = urem <2 x i32> %x, %shl.y 4619 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4620 ret void 4621} 4622 4623define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 4624; CHECK-LABEL: @sdiv_i32_oddk_denom( 4625; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195 4626; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] 4627; CHECK-NEXT: ret void 4628; 4629; GCN-LABEL: sdiv_i32_oddk_denom: 4630; GCN: ; %bb.0: 4631; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4632; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4633; GCN-NEXT: v_mov_b32_e32 v0, 0xd9528441 4634; GCN-NEXT: s_mov_b32 s7, 0xf000 4635; GCN-NEXT: s_mov_b32 s6, -1 4636; GCN-NEXT: s_waitcnt lgkmcnt(0) 4637; GCN-NEXT: v_mul_hi_i32 v0, s0, v0 4638; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 4639; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 4640; GCN-NEXT: v_ashrrev_i32_e32 v0, 20, v0 4641; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4642; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 
4643; GCN-NEXT: s_endpgm 4644 %r = sdiv i32 %x, 1235195 4645 store i32 %r, i32 addrspace(1)* %out 4646 ret void 4647} 4648 4649define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 4650; CHECK-LABEL: @sdiv_i32_pow2k_denom( 4651; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 4096 4652; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] 4653; CHECK-NEXT: ret void 4654; 4655; GCN-LABEL: sdiv_i32_pow2k_denom: 4656; GCN: ; %bb.0: 4657; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4658; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4659; GCN-NEXT: s_mov_b32 s7, 0xf000 4660; GCN-NEXT: s_mov_b32 s6, -1 4661; GCN-NEXT: s_waitcnt lgkmcnt(0) 4662; GCN-NEXT: s_ashr_i32 s1, s0, 31 4663; GCN-NEXT: s_lshr_b32 s1, s1, 20 4664; GCN-NEXT: s_add_i32 s0, s0, s1 4665; GCN-NEXT: s_ashr_i32 s0, s0, 12 4666; GCN-NEXT: v_mov_b32_e32 v0, s0 4667; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4668; GCN-NEXT: s_endpgm 4669 %r = sdiv i32 %x, 4096 4670 store i32 %r, i32 addrspace(1)* %out 4671 ret void 4672} 4673 4674define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 4675; CHECK-LABEL: @sdiv_i32_pow2_shl_denom( 4676; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 4677; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]] 4678; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] 4679; CHECK-NEXT: ret void 4680; 4681; GCN-LABEL: sdiv_i32_pow2_shl_denom: 4682; GCN: ; %bb.0: 4683; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4684; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4685; GCN-NEXT: s_mov_b32 s7, 0xf000 4686; GCN-NEXT: s_mov_b32 s6, -1 4687; GCN-NEXT: s_waitcnt lgkmcnt(0) 4688; GCN-NEXT: s_lshl_b32 s3, 0x1000, s3 4689; GCN-NEXT: s_ashr_i32 s8, s3, 31 4690; GCN-NEXT: s_add_i32 s3, s3, s8 4691; GCN-NEXT: s_xor_b32 s9, s3, s8 4692; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 4693; GCN-NEXT: s_ashr_i32 s3, s2, 31 4694; GCN-NEXT: s_add_i32 s2, s2, s3 4695; GCN-NEXT: s_xor_b32 s2, s2, s3 4696; GCN-NEXT: v_rcp_iflag_f32_e32 
v0, v0 4697; GCN-NEXT: s_xor_b32 s3, s3, s8 4698; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 4699; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4700; GCN-NEXT: v_mul_lo_u32 v1, v0, s9 4701; GCN-NEXT: v_mul_hi_u32 v2, v0, s9 4702; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 4703; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 4704; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 4705; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 4706; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 4707; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 4708; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 4709; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 4710; GCN-NEXT: v_mul_lo_u32 v1, v0, s9 4711; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 4712; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 4713; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v1 4714; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 4715; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 4716; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] 4717; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4718; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] 4719; GCN-NEXT: v_xor_b32_e32 v0, s3, v0 4720; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 4721; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4722; GCN-NEXT: s_endpgm 4723 %shl.y = shl i32 4096, %y 4724 %r = sdiv i32 %x, %shl.y 4725 store i32 %r, i32 addrspace(1)* %out 4726 ret void 4727} 4728 4729define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4730; CHECK-LABEL: @sdiv_v2i32_pow2k_denom( 4731; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4732; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 4733; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4734; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4735; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096 4736; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4737; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]] 4738; CHECK-NEXT: ret void 4739; 4740; 
GCN-LABEL: sdiv_v2i32_pow2k_denom: 4741; GCN: ; %bb.0: 4742; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4743; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4744; GCN-NEXT: s_mov_b32 s7, 0xf000 4745; GCN-NEXT: s_mov_b32 s6, -1 4746; GCN-NEXT: s_waitcnt lgkmcnt(0) 4747; GCN-NEXT: s_ashr_i32 s2, s0, 31 4748; GCN-NEXT: s_lshr_b32 s2, s2, 20 4749; GCN-NEXT: s_ashr_i32 s3, s1, 31 4750; GCN-NEXT: s_add_i32 s0, s0, s2 4751; GCN-NEXT: s_lshr_b32 s2, s3, 20 4752; GCN-NEXT: s_add_i32 s1, s1, s2 4753; GCN-NEXT: s_ashr_i32 s0, s0, 12 4754; GCN-NEXT: s_ashr_i32 s1, s1, 12 4755; GCN-NEXT: v_mov_b32_e32 v0, s0 4756; GCN-NEXT: v_mov_b32_e32 v1, s1 4757; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4758; GCN-NEXT: s_endpgm 4759 %r = sdiv <2 x i32> %x, <i32 4096, i32 4096> 4760 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4761 ret void 4762} 4763 4764define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4765; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom( 4766; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4767; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 4768; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4769; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4770; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095 4771; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4772; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]] 4773; CHECK-NEXT: ret void 4774; 4775; GCN-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 4776; GCN: ; %bb.0: 4777; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4778; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4779; GCN-NEXT: v_mov_b32_e32 v0, 0x80080081 4780; GCN-NEXT: s_mov_b32 s7, 0xf000 4781; GCN-NEXT: s_mov_b32 s6, -1 4782; GCN-NEXT: s_waitcnt lgkmcnt(0) 4783; GCN-NEXT: v_mul_hi_i32 v0, s1, v0 4784; GCN-NEXT: s_ashr_i32 s2, s0, 31 4785; GCN-NEXT: s_lshr_b32 s2, s2, 20 4786; 
GCN-NEXT: s_add_i32 s0, s0, s2 4787; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v0 4788; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 4789; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0 4790; GCN-NEXT: s_ashr_i32 s0, s0, 12 4791; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v0 4792; GCN-NEXT: v_mov_b32_e32 v0, s0 4793; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4794; GCN-NEXT: s_endpgm 4795 %r = sdiv <2 x i32> %x, <i32 4096, i32 4095> 4796 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4797 ret void 4798} 4799 4800define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 4801; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom( 4802; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 4803; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4804; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 4805; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 4806; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 4807; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4808; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 4809; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 4810; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 4811; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 4812; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 4813; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 4814; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41F0000000000000 4815; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 4816; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 4817; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP9]] to i64 4818; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 4819; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 4820; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 4821; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 4822; CHECK-NEXT: [[TMP20:%.*]] = sub 
i32 0, [[TMP17]] 4823; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0 4824; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 [[TMP17]] 4825; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64 4826; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP13]] to i64 4827; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 4828; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 4829; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 4830; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 4831; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP13]], [[TMP28]] 4832; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP13]], [[TMP28]] 4833; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP21]], i32 [[TMP29]], i32 [[TMP30]] 4834; CHECK-NEXT: [[TMP32:%.*]] = zext i32 [[TMP31]] to i64 4835; CHECK-NEXT: [[TMP33:%.*]] = zext i32 [[TMP8]] to i64 4836; CHECK-NEXT: [[TMP34:%.*]] = mul i64 [[TMP32]], [[TMP33]] 4837; CHECK-NEXT: [[TMP35:%.*]] = trunc i64 [[TMP34]] to i32 4838; CHECK-NEXT: [[TMP36:%.*]] = lshr i64 [[TMP34]], 32 4839; CHECK-NEXT: [[TMP37:%.*]] = trunc i64 [[TMP36]] to i32 4840; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP9]] 4841; CHECK-NEXT: [[TMP39:%.*]] = sub i32 [[TMP8]], [[TMP38]] 4842; CHECK-NEXT: [[TMP40:%.*]] = icmp uge i32 [[TMP39]], [[TMP9]] 4843; CHECK-NEXT: [[TMP41:%.*]] = icmp uge i32 [[TMP8]], [[TMP38]] 4844; CHECK-NEXT: [[TMP42:%.*]] = and i1 [[TMP40]], [[TMP41]] 4845; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP37]], 1 4846; CHECK-NEXT: [[TMP44:%.*]] = sub i32 [[TMP37]], 1 4847; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP42]], i32 [[TMP43]], i32 [[TMP37]] 4848; CHECK-NEXT: [[TMP46:%.*]] = select i1 [[TMP41]], i32 [[TMP45]], i32 [[TMP44]] 4849; CHECK-NEXT: [[TMP47:%.*]] = xor i32 [[TMP46]], [[TMP5]] 4850; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP47]], [[TMP5]] 4851; CHECK-NEXT: [[TMP49:%.*]] = insertelement <2 x i32> undef, i32 [[TMP48]], i64 0 4852; CHECK-NEXT: [[TMP50:%.*]] = extractelement <2 x i32> [[X]], i64 1 4853; CHECK-NEXT: 
[[TMP51:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 4854; CHECK-NEXT: [[TMP52:%.*]] = ashr i32 [[TMP50]], 31 4855; CHECK-NEXT: [[TMP53:%.*]] = ashr i32 [[TMP51]], 31 4856; CHECK-NEXT: [[TMP54:%.*]] = xor i32 [[TMP52]], [[TMP53]] 4857; CHECK-NEXT: [[TMP55:%.*]] = add i32 [[TMP50]], [[TMP52]] 4858; CHECK-NEXT: [[TMP56:%.*]] = add i32 [[TMP51]], [[TMP53]] 4859; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP52]] 4860; CHECK-NEXT: [[TMP58:%.*]] = xor i32 [[TMP56]], [[TMP53]] 4861; CHECK-NEXT: [[TMP59:%.*]] = uitofp i32 [[TMP58]] to float 4862; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP59]]) 4863; CHECK-NEXT: [[TMP61:%.*]] = fmul fast float [[TMP60]], 0x41F0000000000000 4864; CHECK-NEXT: [[TMP62:%.*]] = fptoui float [[TMP61]] to i32 4865; CHECK-NEXT: [[TMP63:%.*]] = zext i32 [[TMP62]] to i64 4866; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP58]] to i64 4867; CHECK-NEXT: [[TMP65:%.*]] = mul i64 [[TMP63]], [[TMP64]] 4868; CHECK-NEXT: [[TMP66:%.*]] = trunc i64 [[TMP65]] to i32 4869; CHECK-NEXT: [[TMP67:%.*]] = lshr i64 [[TMP65]], 32 4870; CHECK-NEXT: [[TMP68:%.*]] = trunc i64 [[TMP67]] to i32 4871; CHECK-NEXT: [[TMP69:%.*]] = sub i32 0, [[TMP66]] 4872; CHECK-NEXT: [[TMP70:%.*]] = icmp eq i32 [[TMP68]], 0 4873; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP69]], i32 [[TMP66]] 4874; CHECK-NEXT: [[TMP72:%.*]] = zext i32 [[TMP71]] to i64 4875; CHECK-NEXT: [[TMP73:%.*]] = zext i32 [[TMP62]] to i64 4876; CHECK-NEXT: [[TMP74:%.*]] = mul i64 [[TMP72]], [[TMP73]] 4877; CHECK-NEXT: [[TMP75:%.*]] = trunc i64 [[TMP74]] to i32 4878; CHECK-NEXT: [[TMP76:%.*]] = lshr i64 [[TMP74]], 32 4879; CHECK-NEXT: [[TMP77:%.*]] = trunc i64 [[TMP76]] to i32 4880; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP62]], [[TMP77]] 4881; CHECK-NEXT: [[TMP79:%.*]] = sub i32 [[TMP62]], [[TMP77]] 4882; CHECK-NEXT: [[TMP80:%.*]] = select i1 [[TMP70]], i32 [[TMP78]], i32 [[TMP79]] 4883; CHECK-NEXT: [[TMP81:%.*]] = zext i32 [[TMP80]] to i64 4884; CHECK-NEXT: 
[[TMP82:%.*]] = zext i32 [[TMP57]] to i64 4885; CHECK-NEXT: [[TMP83:%.*]] = mul i64 [[TMP81]], [[TMP82]] 4886; CHECK-NEXT: [[TMP84:%.*]] = trunc i64 [[TMP83]] to i32 4887; CHECK-NEXT: [[TMP85:%.*]] = lshr i64 [[TMP83]], 32 4888; CHECK-NEXT: [[TMP86:%.*]] = trunc i64 [[TMP85]] to i32 4889; CHECK-NEXT: [[TMP87:%.*]] = mul i32 [[TMP86]], [[TMP58]] 4890; CHECK-NEXT: [[TMP88:%.*]] = sub i32 [[TMP57]], [[TMP87]] 4891; CHECK-NEXT: [[TMP89:%.*]] = icmp uge i32 [[TMP88]], [[TMP58]] 4892; CHECK-NEXT: [[TMP90:%.*]] = icmp uge i32 [[TMP57]], [[TMP87]] 4893; CHECK-NEXT: [[TMP91:%.*]] = and i1 [[TMP89]], [[TMP90]] 4894; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP86]], 1 4895; CHECK-NEXT: [[TMP93:%.*]] = sub i32 [[TMP86]], 1 4896; CHECK-NEXT: [[TMP94:%.*]] = select i1 [[TMP91]], i32 [[TMP92]], i32 [[TMP86]] 4897; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP90]], i32 [[TMP94]], i32 [[TMP93]] 4898; CHECK-NEXT: [[TMP96:%.*]] = xor i32 [[TMP95]], [[TMP54]] 4899; CHECK-NEXT: [[TMP97:%.*]] = sub i32 [[TMP96]], [[TMP54]] 4900; CHECK-NEXT: [[TMP98:%.*]] = insertelement <2 x i32> [[TMP49]], i32 [[TMP97]], i64 1 4901; CHECK-NEXT: store <2 x i32> [[TMP98]], <2 x i32> addrspace(1)* [[OUT:%.*]] 4902; CHECK-NEXT: ret void 4903; 4904; GCN-LABEL: sdiv_v2i32_pow2_shl_denom: 4905; GCN: ; %bb.0: 4906; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4907; GCN-NEXT: s_movk_i32 s4, 0x1000 4908; GCN-NEXT: s_mov_b32 s14, 0x4f800000 4909; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 4910; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xb 4911; GCN-NEXT: s_mov_b32 s11, 0xf000 4912; GCN-NEXT: s_waitcnt lgkmcnt(0) 4913; GCN-NEXT: s_lshl_b32 s2, s4, s2 4914; GCN-NEXT: s_ashr_i32 s5, s2, 31 4915; GCN-NEXT: s_add_i32 s2, s2, s5 4916; GCN-NEXT: s_xor_b32 s13, s2, s5 4917; GCN-NEXT: v_cvt_f32_u32_e32 v0, s13 4918; GCN-NEXT: s_ashr_i32 s2, s6, 31 4919; GCN-NEXT: s_lshl_b32 s0, s4, s3 4920; GCN-NEXT: s_add_i32 s1, s6, s2 4921; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4922; GCN-NEXT: s_ashr_i32 s6, s0, 31 4923; GCN-NEXT: 
s_add_i32 s4, s0, s6 4924; GCN-NEXT: s_xor_b32 s3, s1, s2 4925; GCN-NEXT: v_mul_f32_e32 v0, s14, v0 4926; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4927; GCN-NEXT: s_xor_b32 s15, s4, s6 4928; GCN-NEXT: s_xor_b32 s12, s2, s5 4929; GCN-NEXT: s_mov_b32 s10, -1 4930; GCN-NEXT: v_mul_lo_u32 v1, v0, s13 4931; GCN-NEXT: v_mul_hi_u32 v2, v0, s13 4932; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 4933; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 4934; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 4935; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 4936; GCN-NEXT: v_cvt_f32_u32_e32 v2, s15 4937; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v0 4938; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 4939; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 4940; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v2 4941; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 4942; GCN-NEXT: v_mul_f32_e32 v1, s14, v1 4943; GCN-NEXT: v_mul_lo_u32 v2, v0, s13 4944; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4945; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 4946; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 4947; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 4948; GCN-NEXT: v_mul_lo_u32 v4, v1, s15 4949; GCN-NEXT: v_mul_hi_u32 v5, v1, s15 4950; GCN-NEXT: s_ashr_i32 s13, s7, 31 4951; GCN-NEXT: s_add_i32 s7, s7, s13 4952; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 4953; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 4954; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] 4955; GCN-NEXT: v_mul_hi_u32 v4, v4, v1 4956; GCN-NEXT: s_xor_b32 s7, s7, s13 4957; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s3, v2 4958; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 4959; GCN-NEXT: v_add_i32_e32 v5, vcc, v4, v1 4960; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v4, v1 4961; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] 4962; GCN-NEXT: v_mul_hi_u32 v1, v1, s7 4963; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 4964; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4965; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[2:3] 4966; GCN-NEXT: v_mul_lo_u32 v2, v1, s15 4967; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 4968; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, 
v0 4969; GCN-NEXT: s_xor_b32 s4, s13, s6 4970; GCN-NEXT: v_sub_i32_e32 v3, vcc, s7, v2 4971; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v3 4972; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s7, v2 4973; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v1 4974; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1 4975; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 4976; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4977; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[2:3] 4978; GCN-NEXT: v_xor_b32_e32 v1, s4, v1 4979; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s4, v1 4980; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 4981; GCN-NEXT: s_endpgm 4982 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4983 %r = sdiv <2 x i32> %x, %shl.y 4984 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4985 ret void 4986} 4987 4988define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 4989; CHECK-LABEL: @srem_i32_oddk_denom( 4990; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 4991; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] 4992; CHECK-NEXT: ret void 4993; 4994; GCN-LABEL: srem_i32_oddk_denom: 4995; GCN: ; %bb.0: 4996; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4997; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4998; GCN-NEXT: v_mov_b32_e32 v0, 0xd9528441 4999; GCN-NEXT: s_mov_b32 s7, 0xf000 5000; GCN-NEXT: s_mov_b32 s6, -1 5001; GCN-NEXT: s_waitcnt lgkmcnt(0) 5002; GCN-NEXT: v_mul_hi_i32 v0, s0, v0 5003; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 5004; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 5005; GCN-NEXT: v_ashrrev_i32_e32 v0, 20, v0 5006; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 5007; GCN-NEXT: v_mul_i32_i24_e32 v0, 0x12d8fb, v0 5008; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 5009; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 5010; GCN-NEXT: s_endpgm 5011 %r = srem i32 %x, 1235195 5012 store i32 %r, i32 addrspace(1)* %out 5013 ret void 5014} 5015 5016define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 5017; CHECK-LABEL: @srem_i32_pow2k_denom( 5018; CHECK-NEXT: 
[[R:%.*]] = srem i32 [[X:%.*]], 4096 5019; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] 5020; CHECK-NEXT: ret void 5021; 5022; GCN-LABEL: srem_i32_pow2k_denom: 5023; GCN: ; %bb.0: 5024; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5025; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 5026; GCN-NEXT: s_mov_b32 s7, 0xf000 5027; GCN-NEXT: s_mov_b32 s6, -1 5028; GCN-NEXT: s_waitcnt lgkmcnt(0) 5029; GCN-NEXT: s_ashr_i32 s1, s0, 31 5030; GCN-NEXT: s_lshr_b32 s1, s1, 20 5031; GCN-NEXT: s_add_i32 s1, s0, s1 5032; GCN-NEXT: s_and_b32 s1, s1, 0xfffff000 5033; GCN-NEXT: s_sub_i32 s0, s0, s1 5034; GCN-NEXT: v_mov_b32_e32 v0, s0 5035; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 5036; GCN-NEXT: s_endpgm 5037 %r = srem i32 %x, 4096 5038 store i32 %r, i32 addrspace(1)* %out 5039 ret void 5040} 5041 5042define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 5043; CHECK-LABEL: @srem_i32_pow2_shl_denom( 5044; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 5045; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]] 5046; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]] 5047; CHECK-NEXT: ret void 5048; 5049; GCN-LABEL: srem_i32_pow2_shl_denom: 5050; GCN: ; %bb.0: 5051; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 5052; GCN-NEXT: s_mov_b32 s7, 0xf000 5053; GCN-NEXT: s_mov_b32 s6, -1 5054; GCN-NEXT: s_waitcnt lgkmcnt(0) 5055; GCN-NEXT: s_lshl_b32 s2, 0x1000, s5 5056; GCN-NEXT: s_ashr_i32 s3, s2, 31 5057; GCN-NEXT: s_add_i32 s2, s2, s3 5058; GCN-NEXT: s_xor_b32 s10, s2, s3 5059; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 5060; GCN-NEXT: s_ashr_i32 s8, s4, 31 5061; GCN-NEXT: s_add_i32 s4, s4, s8 5062; GCN-NEXT: s_xor_b32 s9, s4, s8 5063; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 5064; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5065; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 5066; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5067; GCN-NEXT: v_mul_lo_u32 v1, v0, s10 5068; GCN-NEXT: v_mul_hi_u32 v2, v0, s10 5069; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, 
v1 5070; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v2 5071; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] 5072; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 5073; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 5074; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 5075; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] 5076; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 5077; GCN-NEXT: v_mul_lo_u32 v0, v0, s10 5078; GCN-NEXT: v_sub_i32_e32 v1, vcc, s9, v0 5079; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s9, v0 5080; GCN-NEXT: v_add_i32_e32 v2, vcc, s10, v1 5081; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s10, v1 5082; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v1 5083; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] 5084; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5085; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] 5086; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 5087; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 5088; GCN-NEXT: s_waitcnt lgkmcnt(0) 5089; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 5090; GCN-NEXT: s_endpgm 5091 %shl.y = shl i32 4096, %y 5092 %r = srem i32 %x, %shl.y 5093 store i32 %r, i32 addrspace(1)* %out 5094 ret void 5095} 5096 5097define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 5098; CHECK-LABEL: @srem_v2i32_pow2k_denom( 5099; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5100; CHECK-NEXT: [[TMP2:%.*]] = srem i32 [[TMP1]], 4096 5101; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 5102; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5103; CHECK-NEXT: [[TMP5:%.*]] = srem i32 [[TMP4]], 4096 5104; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5105; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]] 5106; CHECK-NEXT: ret void 5107; 5108; GCN-LABEL: srem_v2i32_pow2k_denom: 5109; GCN: ; %bb.0: 5110; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5111; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5112; GCN-NEXT: s_movk_i32 s2, 0xf000 5113; GCN-NEXT: 
s_mov_b32 s7, 0xf000 5114; GCN-NEXT: s_mov_b32 s6, -1 5115; GCN-NEXT: s_waitcnt lgkmcnt(0) 5116; GCN-NEXT: s_ashr_i32 s3, s0, 31 5117; GCN-NEXT: s_lshr_b32 s3, s3, 20 5118; GCN-NEXT: s_add_i32 s3, s0, s3 5119; GCN-NEXT: s_and_b32 s3, s3, s2 5120; GCN-NEXT: s_sub_i32 s0, s0, s3 5121; GCN-NEXT: s_ashr_i32 s3, s1, 31 5122; GCN-NEXT: s_lshr_b32 s3, s3, 20 5123; GCN-NEXT: s_add_i32 s3, s1, s3 5124; GCN-NEXT: s_and_b32 s2, s3, s2 5125; GCN-NEXT: s_sub_i32 s1, s1, s2 5126; GCN-NEXT: v_mov_b32_e32 v0, s0 5127; GCN-NEXT: v_mov_b32_e32 v1, s1 5128; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5129; GCN-NEXT: s_endpgm 5130 %r = srem <2 x i32> %x, <i32 4096, i32 4096> 5131 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5132 ret void 5133} 5134 5135define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 5136; CHECK-LABEL: @srem_v2i32_pow2_shl_denom( 5137; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 5138; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5139; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 5140; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 5141; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 5142; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 5143; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 5144; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 5145; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 5146; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 5147; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 5148; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41F0000000000000 5149; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 5150; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64 5151; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP8]] to i64 5152; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]] 5153; CHECK-NEXT: 
[[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 5154; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP15]], 32 5155; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 5156; CHECK-NEXT: [[TMP19:%.*]] = sub i32 0, [[TMP16]] 5157; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], 0 5158; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP19]], i32 [[TMP16]] 5159; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 5160; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64 5161; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 5162; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 5163; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 5164; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 5165; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP12]], [[TMP27]] 5166; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP12]], [[TMP27]] 5167; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP20]], i32 [[TMP28]], i32 [[TMP29]] 5168; CHECK-NEXT: [[TMP31:%.*]] = zext i32 [[TMP30]] to i64 5169; CHECK-NEXT: [[TMP32:%.*]] = zext i32 [[TMP7]] to i64 5170; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP31]], [[TMP32]] 5171; CHECK-NEXT: [[TMP34:%.*]] = trunc i64 [[TMP33]] to i32 5172; CHECK-NEXT: [[TMP35:%.*]] = lshr i64 [[TMP33]], 32 5173; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 5174; CHECK-NEXT: [[TMP37:%.*]] = mul i32 [[TMP36]], [[TMP8]] 5175; CHECK-NEXT: [[TMP38:%.*]] = sub i32 [[TMP7]], [[TMP37]] 5176; CHECK-NEXT: [[TMP39:%.*]] = icmp uge i32 [[TMP38]], [[TMP8]] 5177; CHECK-NEXT: [[TMP40:%.*]] = icmp uge i32 [[TMP7]], [[TMP37]] 5178; CHECK-NEXT: [[TMP41:%.*]] = and i1 [[TMP39]], [[TMP40]] 5179; CHECK-NEXT: [[TMP42:%.*]] = sub i32 [[TMP38]], [[TMP8]] 5180; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP38]], [[TMP8]] 5181; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP41]], i32 [[TMP42]], i32 [[TMP38]] 5182; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP40]], i32 [[TMP44]], i32 [[TMP43]] 5183; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP45]], [[TMP3]] 5184; CHECK-NEXT: [[TMP47:%.*]] = 
sub i32 [[TMP46]], [[TMP3]] 5185; CHECK-NEXT: [[TMP48:%.*]] = insertelement <2 x i32> undef, i32 [[TMP47]], i64 0 5186; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[X]], i64 1 5187; CHECK-NEXT: [[TMP50:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 5188; CHECK-NEXT: [[TMP51:%.*]] = ashr i32 [[TMP49]], 31 5189; CHECK-NEXT: [[TMP52:%.*]] = ashr i32 [[TMP50]], 31 5190; CHECK-NEXT: [[TMP53:%.*]] = add i32 [[TMP49]], [[TMP51]] 5191; CHECK-NEXT: [[TMP54:%.*]] = add i32 [[TMP50]], [[TMP52]] 5192; CHECK-NEXT: [[TMP55:%.*]] = xor i32 [[TMP53]], [[TMP51]] 5193; CHECK-NEXT: [[TMP56:%.*]] = xor i32 [[TMP54]], [[TMP52]] 5194; CHECK-NEXT: [[TMP57:%.*]] = uitofp i32 [[TMP56]] to float 5195; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 5196; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP58]], 0x41F0000000000000 5197; CHECK-NEXT: [[TMP60:%.*]] = fptoui float [[TMP59]] to i32 5198; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP60]] to i64 5199; CHECK-NEXT: [[TMP62:%.*]] = zext i32 [[TMP56]] to i64 5200; CHECK-NEXT: [[TMP63:%.*]] = mul i64 [[TMP61]], [[TMP62]] 5201; CHECK-NEXT: [[TMP64:%.*]] = trunc i64 [[TMP63]] to i32 5202; CHECK-NEXT: [[TMP65:%.*]] = lshr i64 [[TMP63]], 32 5203; CHECK-NEXT: [[TMP66:%.*]] = trunc i64 [[TMP65]] to i32 5204; CHECK-NEXT: [[TMP67:%.*]] = sub i32 0, [[TMP64]] 5205; CHECK-NEXT: [[TMP68:%.*]] = icmp eq i32 [[TMP66]], 0 5206; CHECK-NEXT: [[TMP69:%.*]] = select i1 [[TMP68]], i32 [[TMP67]], i32 [[TMP64]] 5207; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i64 5208; CHECK-NEXT: [[TMP71:%.*]] = zext i32 [[TMP60]] to i64 5209; CHECK-NEXT: [[TMP72:%.*]] = mul i64 [[TMP70]], [[TMP71]] 5210; CHECK-NEXT: [[TMP73:%.*]] = trunc i64 [[TMP72]] to i32 5211; CHECK-NEXT: [[TMP74:%.*]] = lshr i64 [[TMP72]], 32 5212; CHECK-NEXT: [[TMP75:%.*]] = trunc i64 [[TMP74]] to i32 5213; CHECK-NEXT: [[TMP76:%.*]] = add i32 [[TMP60]], [[TMP75]] 5214; CHECK-NEXT: [[TMP77:%.*]] = sub i32 [[TMP60]], [[TMP75]] 5215; CHECK-NEXT: 
[[TMP78:%.*]] = select i1 [[TMP68]], i32 [[TMP76]], i32 [[TMP77]] 5216; CHECK-NEXT: [[TMP79:%.*]] = zext i32 [[TMP78]] to i64 5217; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP55]] to i64 5218; CHECK-NEXT: [[TMP81:%.*]] = mul i64 [[TMP79]], [[TMP80]] 5219; CHECK-NEXT: [[TMP82:%.*]] = trunc i64 [[TMP81]] to i32 5220; CHECK-NEXT: [[TMP83:%.*]] = lshr i64 [[TMP81]], 32 5221; CHECK-NEXT: [[TMP84:%.*]] = trunc i64 [[TMP83]] to i32 5222; CHECK-NEXT: [[TMP85:%.*]] = mul i32 [[TMP84]], [[TMP56]] 5223; CHECK-NEXT: [[TMP86:%.*]] = sub i32 [[TMP55]], [[TMP85]] 5224; CHECK-NEXT: [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP56]] 5225; CHECK-NEXT: [[TMP88:%.*]] = icmp uge i32 [[TMP55]], [[TMP85]] 5226; CHECK-NEXT: [[TMP89:%.*]] = and i1 [[TMP87]], [[TMP88]] 5227; CHECK-NEXT: [[TMP90:%.*]] = sub i32 [[TMP86]], [[TMP56]] 5228; CHECK-NEXT: [[TMP91:%.*]] = add i32 [[TMP86]], [[TMP56]] 5229; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP89]], i32 [[TMP90]], i32 [[TMP86]] 5230; CHECK-NEXT: [[TMP93:%.*]] = select i1 [[TMP88]], i32 [[TMP92]], i32 [[TMP91]] 5231; CHECK-NEXT: [[TMP94:%.*]] = xor i32 [[TMP93]], [[TMP51]] 5232; CHECK-NEXT: [[TMP95:%.*]] = sub i32 [[TMP94]], [[TMP51]] 5233; CHECK-NEXT: [[TMP96:%.*]] = insertelement <2 x i32> [[TMP48]], i32 [[TMP95]], i64 1 5234; CHECK-NEXT: store <2 x i32> [[TMP96]], <2 x i32> addrspace(1)* [[OUT:%.*]] 5235; CHECK-NEXT: ret void 5236; 5237; GCN-LABEL: srem_v2i32_pow2_shl_denom: 5238; GCN: ; %bb.0: 5239; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 5240; GCN-NEXT: s_movk_i32 s4, 0x1000 5241; GCN-NEXT: s_mov_b32 s14, 0x4f800000 5242; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xb 5243; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 5244; GCN-NEXT: s_waitcnt lgkmcnt(0) 5245; GCN-NEXT: s_lshl_b32 s2, s4, s2 5246; GCN-NEXT: s_ashr_i32 s5, s2, 31 5247; GCN-NEXT: s_add_i32 s2, s2, s5 5248; GCN-NEXT: s_xor_b32 s13, s2, s5 5249; GCN-NEXT: v_cvt_f32_u32_e32 v0, s13 5250; GCN-NEXT: s_lshl_b32 s2, s4, s3 5251; GCN-NEXT: s_ashr_i32 s12, s6, 31 5252; GCN-NEXT: 
s_add_i32 s3, s6, s12 5253; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 5254; GCN-NEXT: s_ashr_i32 s4, s2, 31 5255; GCN-NEXT: s_add_i32 s6, s2, s4 5256; GCN-NEXT: s_xor_b32 s5, s3, s12 5257; GCN-NEXT: v_mul_f32_e32 v0, s14, v0 5258; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5259; GCN-NEXT: s_xor_b32 s15, s6, s4 5260; GCN-NEXT: s_ashr_i32 s6, s7, 31 5261; GCN-NEXT: s_add_i32 s7, s7, s6 5262; GCN-NEXT: v_mul_lo_u32 v1, v0, s13 5263; GCN-NEXT: v_mul_hi_u32 v2, v0, s13 5264; GCN-NEXT: s_xor_b32 s7, s7, s6 5265; GCN-NEXT: s_mov_b32 s11, 0xf000 5266; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 5267; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v2 5268; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] 5269; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 5270; GCN-NEXT: v_cvt_f32_u32_e32 v2, s15 5271; GCN-NEXT: s_mov_b32 s10, -1 5272; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v0 5273; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 5274; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v2 5275; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[2:3] 5276; GCN-NEXT: v_mul_hi_u32 v0, v0, s5 5277; GCN-NEXT: v_mul_f32_e32 v1, s14, v1 5278; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5279; GCN-NEXT: v_mul_lo_u32 v0, v0, s13 5280; GCN-NEXT: v_mul_lo_u32 v4, v1, s15 5281; GCN-NEXT: v_mul_hi_u32 v5, v1, s15 5282; GCN-NEXT: v_sub_i32_e32 v2, vcc, s5, v0 5283; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s5, v0 5284; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 5285; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 5286; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] 5287; GCN-NEXT: v_mul_hi_u32 v4, v4, v1 5288; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v2 5289; GCN-NEXT: v_add_i32_e32 v3, vcc, s13, v2 5290; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s13, v2 5291; GCN-NEXT: v_add_i32_e32 v5, vcc, v4, v1 5292; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v4, v1 5293; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] 5294; GCN-NEXT: v_mul_hi_u32 v1, v1, s7 5295; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 5296; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 5297; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[2:3] 5298; 
GCN-NEXT: v_mul_lo_u32 v1, v1, s15 5299; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 5300; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 5301; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 5302; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s7, v1 5303; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v2 5304; GCN-NEXT: v_add_i32_e32 v3, vcc, s15, v2 5305; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s15, v2 5306; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 5307; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 5308; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[2:3] 5309; GCN-NEXT: v_xor_b32_e32 v1, s6, v1 5310; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 5311; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 5312; GCN-NEXT: s_endpgm 5313 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 5314 %r = srem <2 x i32> %x, %shl.y 5315 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5316 ret void 5317} 5318 5319define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 5320; CHECK-LABEL: @udiv_i64_oddk_denom( 5321; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 5322; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] 5323; CHECK-NEXT: ret void 5324; 5325; GCN-LABEL: udiv_i64_oddk_denom: 5326; GCN: ; %bb.0: 5327; GCN-NEXT: v_mov_b32_e32 v0, 0x4f176a73 5328; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 5329; GCN-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 5330; GCN-NEXT: v_rcp_f32_e32 v0, v0 5331; GCN-NEXT: s_movk_i32 s2, 0xfee0 5332; GCN-NEXT: s_mov_b32 s3, 0x68958c89 5333; GCN-NEXT: v_mov_b32_e32 v8, 0 5334; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5335; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5336; GCN-NEXT: v_trunc_f32_e32 v1, v1 5337; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5338; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5339; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5340; GCN-NEXT: v_mov_b32_e32 v7, 0 5341; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5342; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 5343; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 5344; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 5345; GCN-NEXT: s_mov_b32 
s11, 0xf000 5346; GCN-NEXT: s_waitcnt lgkmcnt(0) 5347; GCN-NEXT: s_mov_b32 s8, s4 5348; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5349; GCN-NEXT: v_mul_lo_u32 v3, v0, s3 5350; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5351; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 5352; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 5353; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 5354; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 5355; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 5356; GCN-NEXT: s_movk_i32 s4, 0x11e 5357; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5358; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 5359; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 5360; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 5361; GCN-NEXT: s_mov_b32 s10, -1 5362; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5363; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc 5364; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 5365; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5366; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 5367; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5368; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 5369; GCN-NEXT: v_mul_hi_u32 v5, v0, s3 5370; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 5371; GCN-NEXT: v_mul_lo_u32 v6, v2, s3 5372; GCN-NEXT: s_mov_b32 s2, 0x976a7377 5373; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 5374; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 5375; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 5376; GCN-NEXT: v_mul_lo_u32 v6, v0, v4 5377; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 5378; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 5379; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 5380; GCN-NEXT: s_movk_i32 s3, 0x11f 5381; GCN-NEXT: s_mov_b32 s9, s5 5382; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 5383; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 5384; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 5385; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 5386; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 5387; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 5388; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc 5389; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc 5390; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 5391; GCN-NEXT: v_addc_u32_e32 v4, vcc, 
v8, v4, vcc 5392; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5393; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 5394; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5395; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5396; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 5397; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 5398; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 5399; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 5400; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 5401; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5402; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5403; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 5404; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 5405; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5406; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 5407; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 5408; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5409; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 5410; GCN-NEXT: v_mul_lo_u32 v2, v0, s3 5411; GCN-NEXT: v_mul_hi_u32 v3, v0, s2 5412; GCN-NEXT: v_mul_lo_u32 v4, v1, s2 5413; GCN-NEXT: v_mov_b32_e32 v5, s3 5414; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5415; GCN-NEXT: v_mul_lo_u32 v3, v0, s2 5416; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 5417; GCN-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 5418; GCN-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 5419; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 5420; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 5421; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 5422; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v4 5423; GCN-NEXT: s_mov_b32 s2, 0x976a7376 5424; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 5425; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v5 5426; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 5427; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 5428; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 5429; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 5430; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 5431; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 5432; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 5433; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 5434; GCN-NEXT: 
v_cndmask_b32_e64 v4, v8, v6, s[0:1] 5435; GCN-NEXT: v_mov_b32_e32 v6, s7 5436; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 5437; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 5438; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 5439; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 5440; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 5441; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 5442; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 5443; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5444; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 5445; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 5446; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5447; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 5448; GCN-NEXT: s_endpgm 5449 %r = udiv i64 %x, 1235195949943 5450 store i64 %r, i64 addrspace(1)* %out 5451 ret void 5452} 5453 5454define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 5455; CHECK-LABEL: @udiv_i64_pow2k_denom( 5456; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096 5457; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] 5458; CHECK-NEXT: ret void 5459; 5460; GCN-LABEL: udiv_i64_pow2k_denom: 5461; GCN: ; %bb.0: 5462; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5463; GCN-NEXT: s_mov_b32 s3, 0xf000 5464; GCN-NEXT: s_mov_b32 s2, -1 5465; GCN-NEXT: s_waitcnt lgkmcnt(0) 5466; GCN-NEXT: s_mov_b32 s0, s4 5467; GCN-NEXT: s_mov_b32 s1, s5 5468; GCN-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 5469; GCN-NEXT: v_mov_b32_e32 v0, s4 5470; GCN-NEXT: v_mov_b32_e32 v1, s5 5471; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5472; GCN-NEXT: s_endpgm 5473 %r = udiv i64 %x, 4096 5474 store i64 %r, i64 addrspace(1)* %out 5475 ret void 5476} 5477 5478define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 5479; CHECK-LABEL: @udiv_i64_pow2_shl_denom( 5480; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 5481; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] 5482; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] 5483; 
CHECK-NEXT: ret void 5484; 5485; GCN-LABEL: udiv_i64_pow2_shl_denom: 5486; GCN: ; %bb.0: 5487; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5488; GCN-NEXT: s_load_dword s8, s[0:1], 0xd 5489; GCN-NEXT: s_mov_b32 s3, 0xf000 5490; GCN-NEXT: s_mov_b32 s2, -1 5491; GCN-NEXT: s_waitcnt lgkmcnt(0) 5492; GCN-NEXT: s_mov_b32 s0, s4 5493; GCN-NEXT: s_add_i32 s8, s8, 12 5494; GCN-NEXT: s_mov_b32 s1, s5 5495; GCN-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 5496; GCN-NEXT: v_mov_b32_e32 v0, s4 5497; GCN-NEXT: v_mov_b32_e32 v1, s5 5498; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5499; GCN-NEXT: s_endpgm 5500 %shl.y = shl i64 4096, %y 5501 %r = udiv i64 %x, %shl.y 5502 store i64 %r, i64 addrspace(1)* %out 5503 ret void 5504} 5505 5506define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5507; CHECK-LABEL: @udiv_v2i64_pow2k_denom( 5508; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5509; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 5510; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5511; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5512; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 5513; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5514; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] 5515; CHECK-NEXT: ret void 5516; 5517; GCN-LABEL: udiv_v2i64_pow2k_denom: 5518; GCN: ; %bb.0: 5519; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5520; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 5521; GCN-NEXT: s_mov_b32 s7, 0xf000 5522; GCN-NEXT: s_mov_b32 s6, -1 5523; GCN-NEXT: s_waitcnt lgkmcnt(0) 5524; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 5525; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 5526; GCN-NEXT: v_mov_b32_e32 v0, s0 5527; GCN-NEXT: v_mov_b32_e32 v1, s1 5528; GCN-NEXT: v_mov_b32_e32 v2, s2 5529; GCN-NEXT: v_mov_b32_e32 v3, s3 5530; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5531; 
GCN-NEXT: s_endpgm 5532 %r = udiv <2 x i64> %x, <i64 4096, i64 4096> 5533 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5534 ret void 5535} 5536 5537define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5538; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom( 5539; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5540; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 5541; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5542; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5543; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 5544; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5545; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] 5546; CHECK-NEXT: ret void 5547; 5548; GCN-LABEL: udiv_v2i64_mixed_pow2k_denom: 5549; GCN: ; %bb.0: 5550; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 5551; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 5552; GCN-NEXT: v_rcp_f32_e32 v0, v0 5553; GCN-NEXT: s_movk_i32 s6, 0xf001 5554; GCN-NEXT: v_mov_b32_e32 v7, 0 5555; GCN-NEXT: v_mov_b32_e32 v2, 0 5556; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5557; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5558; GCN-NEXT: v_trunc_f32_e32 v1, v1 5559; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5560; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5561; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5562; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5563; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5564; GCN-NEXT: s_movk_i32 s0, 0xfff 5565; GCN-NEXT: v_mul_hi_u32 v3, v0, s6 5566; GCN-NEXT: v_mul_lo_u32 v5, v1, s6 5567; GCN-NEXT: v_mul_lo_u32 v4, v0, s6 5568; GCN-NEXT: s_mov_b32 s7, 0xf000 5569; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 5570; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 5571; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 5572; GCN-NEXT: v_mul_lo_u32 v5, v0, v3 5573; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 5574; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 5575; GCN-NEXT: 
v_mul_lo_u32 v3, v1, v3 5576; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5577; GCN-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc 5578; GCN-NEXT: v_mul_lo_u32 v8, v1, v4 5579; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 5580; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 5581; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc 5582; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v2, vcc 5583; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 5584; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v3 5585; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc 5586; GCN-NEXT: v_mul_hi_u32 v5, v0, s6 5587; GCN-NEXT: v_addc_u32_e64 v3, vcc, v1, v4, s[2:3] 5588; GCN-NEXT: v_mul_lo_u32 v6, v3, s6 5589; GCN-NEXT: v_mul_lo_u32 v8, v0, s6 5590; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 5591; GCN-NEXT: s_mov_b32 s6, -1 5592; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 5593; GCN-NEXT: v_mul_lo_u32 v6, v0, v5 5594; GCN-NEXT: v_mul_hi_u32 v9, v0, v8 5595; GCN-NEXT: v_mul_hi_u32 v10, v0, v5 5596; GCN-NEXT: v_mul_hi_u32 v11, v3, v5 5597; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 5598; GCN-NEXT: v_addc_u32_e32 v9, vcc, v7, v10, vcc 5599; GCN-NEXT: v_mul_lo_u32 v10, v3, v8 5600; GCN-NEXT: v_mul_hi_u32 v8, v3, v8 5601; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 5602; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 5603; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v8, vcc 5604; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v2, vcc 5605; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 5606; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc 5607; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 5608; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 5609; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 5610; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5611; GCN-NEXT: s_waitcnt lgkmcnt(0) 5612; GCN-NEXT: v_mul_lo_u32 v3, s10, v1 5613; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 5614; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 5615; GCN-NEXT: v_mul_hi_u32 v6, s11, v1 5616; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 5617; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 5618; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc 5619; GCN-NEXT: 
v_mul_lo_u32 v5, s11, v0 5620; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 5621; GCN-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 5622; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 5623; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc 5624; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v2, vcc 5625; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5626; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v2, vcc 5627; GCN-NEXT: v_mul_lo_u32 v2, v1, s0 5628; GCN-NEXT: v_mul_hi_u32 v3, v0, s0 5629; GCN-NEXT: v_mul_lo_u32 v4, v0, s0 5630; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5631; GCN-NEXT: v_mov_b32_e32 v3, s11 5632; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 5633; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 5634; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s0, v4 5635; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 5636; GCN-NEXT: s_movk_i32 s0, 0xffe 5637; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 5638; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 5639; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 5640; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 5641; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 5642; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 5643; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 5644; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 5645; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 5646; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] 5647; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 5648; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 5649; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] 5650; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc 5651; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 5652; GCN-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] 5653; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc 5654; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] 5655; GCN-NEXT: v_mov_b32_e32 v0, s2 5656; GCN-NEXT: v_mov_b32_e32 v1, s3 5657; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5658; GCN-NEXT: s_endpgm 5659 %r = udiv <2 x i64> %x, <i64 4096, i64 4095> 5660 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5661 ret void 5662} 5663 5664define amdgpu_kernel void 
@udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 5665; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom( 5666; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 5667; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5668; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 5669; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]] 5670; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 5671; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 5672; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 5673; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] 5674; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 5675; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] 5676; CHECK-NEXT: ret void 5677; 5678; GCN-LABEL: udiv_v2i64_pow2_shl_denom: 5679; GCN: ; %bb.0: 5680; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5681; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5682; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 5683; GCN-NEXT: s_mov_b32 s7, 0xf000 5684; GCN-NEXT: s_mov_b32 s6, -1 5685; GCN-NEXT: s_waitcnt lgkmcnt(0) 5686; GCN-NEXT: s_add_i32 s0, s0, 12 5687; GCN-NEXT: s_add_i32 s2, s2, 12 5688; GCN-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 5689; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 5690; GCN-NEXT: v_mov_b32_e32 v0, s0 5691; GCN-NEXT: v_mov_b32_e32 v1, s1 5692; GCN-NEXT: v_mov_b32_e32 v2, s2 5693; GCN-NEXT: v_mov_b32_e32 v3, s3 5694; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5695; GCN-NEXT: s_endpgm 5696 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 5697 %r = udiv <2 x i64> %x, %shl.y 5698 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5699 ret void 5700} 5701 5702define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 5703; CHECK-LABEL: @urem_i64_oddk_denom( 5704; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 
1235195393993 5705; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] 5706; CHECK-NEXT: ret void 5707; 5708; GCN-LABEL: urem_i64_oddk_denom: 5709; GCN: ; %bb.0: 5710; GCN-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 5711; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 5712; GCN-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 5713; GCN-NEXT: v_rcp_f32_e32 v0, v0 5714; GCN-NEXT: s_movk_i32 s2, 0xfee0 5715; GCN-NEXT: s_mov_b32 s3, 0x689e0837 5716; GCN-NEXT: v_mov_b32_e32 v8, 0 5717; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5718; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5719; GCN-NEXT: v_trunc_f32_e32 v1, v1 5720; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5721; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5722; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5723; GCN-NEXT: v_mov_b32_e32 v7, 0 5724; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5725; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 5726; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 5727; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 5728; GCN-NEXT: s_movk_i32 s12, 0x11f 5729; GCN-NEXT: s_mov_b32 s13, 0x9761f7c9 5730; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5731; GCN-NEXT: v_mul_lo_u32 v3, v0, s3 5732; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5733; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 5734; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 5735; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 5736; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 5737; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 5738; GCN-NEXT: s_waitcnt lgkmcnt(0) 5739; GCN-NEXT: s_mov_b32 s9, s5 5740; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5741; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 5742; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 5743; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 5744; GCN-NEXT: s_movk_i32 s5, 0x11e 5745; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5746; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc 5747; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 5748; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5749; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 5750; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5751; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 5752; GCN-NEXT: 
v_mul_hi_u32 v5, v0, s3 5753; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 5754; GCN-NEXT: v_mul_lo_u32 v6, v2, s3 5755; GCN-NEXT: s_mov_b32 s8, s4 5756; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 5757; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 5758; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 5759; GCN-NEXT: v_mul_lo_u32 v6, v0, v4 5760; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 5761; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 5762; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 5763; GCN-NEXT: s_mov_b32 s4, 0x9761f7c8 5764; GCN-NEXT: s_mov_b32 s11, 0xf000 5765; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 5766; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 5767; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 5768; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 5769; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 5770; GCN-NEXT: s_mov_b32 s10, -1 5771; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 5772; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc 5773; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc 5774; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 5775; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 5776; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5777; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 5778; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5779; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5780; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 5781; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 5782; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 5783; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 5784; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 5785; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5786; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5787; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 5788; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 5789; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5790; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 5791; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 5792; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5793; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 5794; GCN-NEXT: v_mul_lo_u32 v2, v0, s12 5795; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 5796; GCN-NEXT: v_mul_lo_u32 v1, v1, s13 5797; GCN-NEXT: 
v_mul_lo_u32 v0, v0, s13 5798; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5799; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 5800; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 5801; GCN-NEXT: v_mov_b32_e32 v3, s12 5802; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 5803; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 5804; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s13, v0 5805; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 5806; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 5807; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 5808; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 5809; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v4 5810; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s13, v4 5811; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 5812; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, v5 5813; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 5814; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 5815; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 5816; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 5817; GCN-NEXT: v_mov_b32_e32 v5, s7 5818; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 5819; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 5820; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 5821; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 5822; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 5823; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s12, v1 5824; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 5825; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 5826; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5827; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 5828; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5829; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 5830; GCN-NEXT: s_endpgm 5831 %r = urem i64 %x, 1235195393993 5832 store i64 %r, i64 addrspace(1)* %out 5833 ret void 5834} 5835 5836define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 5837; CHECK-LABEL: @urem_i64_pow2k_denom( 5838; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 5839; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] 
5840; CHECK-NEXT: ret void 5841; 5842; GCN-LABEL: urem_i64_pow2k_denom: 5843; GCN: ; %bb.0: 5844; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5845; GCN-NEXT: s_mov_b32 s3, 0xf000 5846; GCN-NEXT: s_mov_b32 s2, -1 5847; GCN-NEXT: v_mov_b32_e32 v1, 0 5848; GCN-NEXT: s_waitcnt lgkmcnt(0) 5849; GCN-NEXT: s_mov_b32 s0, s4 5850; GCN-NEXT: s_and_b32 s4, s6, 0xfff 5851; GCN-NEXT: s_mov_b32 s1, s5 5852; GCN-NEXT: v_mov_b32_e32 v0, s4 5853; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5854; GCN-NEXT: s_endpgm 5855 %r = urem i64 %x, 4096 5856 store i64 %r, i64 addrspace(1)* %out 5857 ret void 5858} 5859 5860define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 5861; CHECK-LABEL: @urem_i64_pow2_shl_denom( 5862; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 5863; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]] 5864; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] 5865; CHECK-NEXT: ret void 5866; 5867; GCN-LABEL: urem_i64_pow2_shl_denom: 5868; GCN: ; %bb.0: 5869; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5870; GCN-NEXT: s_load_dword s8, s[0:1], 0xd 5871; GCN-NEXT: s_mov_b32 s3, 0xf000 5872; GCN-NEXT: s_mov_b32 s2, -1 5873; GCN-NEXT: s_waitcnt lgkmcnt(0) 5874; GCN-NEXT: s_mov_b32 s0, s4 5875; GCN-NEXT: s_mov_b32 s1, s5 5876; GCN-NEXT: s_mov_b32 s5, 0 5877; GCN-NEXT: s_movk_i32 s4, 0x1000 5878; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 5879; GCN-NEXT: s_add_u32 s4, s4, -1 5880; GCN-NEXT: s_addc_u32 s5, s5, -1 5881; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 5882; GCN-NEXT: v_mov_b32_e32 v0, s4 5883; GCN-NEXT: v_mov_b32_e32 v1, s5 5884; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5885; GCN-NEXT: s_endpgm 5886 %shl.y = shl i64 4096, %y 5887 %r = urem i64 %x, %shl.y 5888 store i64 %r, i64 addrspace(1)* %out 5889 ret void 5890} 5891 5892define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5893; CHECK-LABEL: @urem_v2i64_pow2k_denom( 5894; CHECK-NEXT: [[TMP1:%.*]] 
= extractelement <2 x i64> [[X:%.*]], i64 0 5895; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096 5896; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5897; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5898; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 5899; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5900; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] 5901; CHECK-NEXT: ret void 5902; 5903; GCN-LABEL: urem_v2i64_pow2k_denom: 5904; GCN: ; %bb.0: 5905; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5906; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 5907; GCN-NEXT: s_movk_i32 s8, 0xfff 5908; GCN-NEXT: v_mov_b32_e32 v1, 0 5909; GCN-NEXT: s_mov_b32 s7, 0xf000 5910; GCN-NEXT: s_mov_b32 s6, -1 5911; GCN-NEXT: s_waitcnt lgkmcnt(0) 5912; GCN-NEXT: s_and_b32 s0, s0, s8 5913; GCN-NEXT: s_and_b32 s1, s2, s8 5914; GCN-NEXT: v_mov_b32_e32 v0, s0 5915; GCN-NEXT: v_mov_b32_e32 v2, s1 5916; GCN-NEXT: v_mov_b32_e32 v3, v1 5917; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5918; GCN-NEXT: s_endpgm 5919 %r = urem <2 x i64> %x, <i64 4096, i64 4096> 5920 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5921 ret void 5922} 5923 5924define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 5925; CHECK-LABEL: @urem_v2i64_pow2_shl_denom( 5926; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 5927; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5928; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 5929; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]] 5930; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 5931; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 5932; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 5933; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], 
[[TMP6]] 5934; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 5935; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] 5936; CHECK-NEXT: ret void 5937; 5938; GCN-LABEL: urem_v2i64_pow2_shl_denom: 5939; GCN: ; %bb.0: 5940; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5941; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5942; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 5943; GCN-NEXT: s_mov_b32 s13, 0 5944; GCN-NEXT: s_movk_i32 s12, 0x1000 5945; GCN-NEXT: s_mov_b32 s7, 0xf000 5946; GCN-NEXT: s_mov_b32 s6, -1 5947; GCN-NEXT: s_waitcnt lgkmcnt(0) 5948; GCN-NEXT: s_lshl_b64 s[2:3], s[12:13], s2 5949; GCN-NEXT: s_lshl_b64 s[0:1], s[12:13], s0 5950; GCN-NEXT: s_add_u32 s0, s0, -1 5951; GCN-NEXT: s_addc_u32 s1, s1, -1 5952; GCN-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] 5953; GCN-NEXT: s_add_u32 s2, s2, -1 5954; GCN-NEXT: s_addc_u32 s3, s3, -1 5955; GCN-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] 5956; GCN-NEXT: v_mov_b32_e32 v0, s0 5957; GCN-NEXT: v_mov_b32_e32 v1, s1 5958; GCN-NEXT: v_mov_b32_e32 v2, s2 5959; GCN-NEXT: v_mov_b32_e32 v3, s3 5960; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5961; GCN-NEXT: s_endpgm 5962 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 5963 %r = urem <2 x i64> %x, %shl.y 5964 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5965 ret void 5966} 5967 5968define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 5969; CHECK-LABEL: @sdiv_i64_oddk_denom( 5970; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 5971; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] 5972; CHECK-NEXT: ret void 5973; 5974; GCN-LABEL: sdiv_i64_oddk_denom: 5975; GCN: ; %bb.0: 5976; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 5977; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 5978; GCN-NEXT: v_rcp_f32_e32 v0, v0 5979; GCN-NEXT: s_mov_b32 s2, 0xffed2705 5980; GCN-NEXT: v_mov_b32_e32 v8, 0 5981; GCN-NEXT: v_mov_b32_e32 v7, 0 5982; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5983; 
GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5984; GCN-NEXT: v_trunc_f32_e32 v1, v1 5985; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5986; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5987; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5988; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 5989; GCN-NEXT: s_mov_b32 s7, 0xf000 5990; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 5991; GCN-NEXT: v_mul_lo_u32 v2, v1, s2 5992; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 5993; GCN-NEXT: s_mov_b32 s6, -1 5994; GCN-NEXT: s_waitcnt lgkmcnt(0) 5995; GCN-NEXT: s_mov_b32 s4, s8 5996; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5997; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 5998; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 5999; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 6000; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 6001; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 6002; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6003; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 6004; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 6005; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 6006; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 6007; GCN-NEXT: s_mov_b32 s5, s9 6008; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 6009; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 6010; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 6011; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6012; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 6013; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 6014; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 6015; GCN-NEXT: v_mul_lo_u32 v4, v2, s2 6016; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 6017; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 6018; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 6019; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 6020; GCN-NEXT: v_mul_lo_u32 v10, v0, v4 6021; GCN-NEXT: v_mul_hi_u32 v12, v0, v4 6022; GCN-NEXT: v_mul_hi_u32 v11, v0, v5 6023; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 6024; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 6025; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 6026; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6027; GCN-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc 6028; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 6029; GCN-NEXT: 
v_add_i32_e32 v5, vcc, v10, v5 6030; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc 6031; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc 6032; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6033; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 6034; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6035; GCN-NEXT: s_ashr_i32 s2, s11, 31 6036; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 6037; GCN-NEXT: s_add_u32 s0, s10, s2 6038; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6039; GCN-NEXT: s_mov_b32 s3, s2 6040; GCN-NEXT: s_addc_u32 s1, s11, s2 6041; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 6042; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6043; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 6044; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 6045; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 6046; GCN-NEXT: v_mul_hi_u32 v5, s1, v1 6047; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 6048; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6049; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 6050; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 6051; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 6052; GCN-NEXT: s_mov_b32 s3, 0x12d8fb 6053; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 6054; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6055; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 6056; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6057; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 6058; GCN-NEXT: v_mul_lo_u32 v2, v1, s3 6059; GCN-NEXT: v_mul_hi_u32 v3, s3, v0 6060; GCN-NEXT: v_mul_lo_u32 v4, v0, s3 6061; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6062; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 6063; GCN-NEXT: v_mov_b32_e32 v3, s1 6064; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 6065; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s3, v4 6066; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 6067; GCN-NEXT: s_mov_b32 s0, 0x12d8fa 6068; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 6069; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 6070; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 6071; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 6072; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 6073; GCN-NEXT: 
v_addc_u32_e32 v6, vcc, 0, v1, vcc 6074; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 6075; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 6076; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 6077; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] 6078; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 6079; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 6080; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] 6081; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 6082; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc 6083; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 6084; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc 6085; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 6086; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 6087; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 6088; GCN-NEXT: v_mov_b32_e32 v2, s2 6089; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 6090; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 6091; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6092; GCN-NEXT: s_endpgm 6093 %r = sdiv i64 %x, 1235195 6094 store i64 %r, i64 addrspace(1)* %out 6095 ret void 6096} 6097 6098define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 6099; CHECK-LABEL: @sdiv_i64_pow2k_denom( 6100; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096 6101; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] 6102; CHECK-NEXT: ret void 6103; 6104; GCN-LABEL: sdiv_i64_pow2k_denom: 6105; GCN: ; %bb.0: 6106; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 6107; GCN-NEXT: s_mov_b32 s3, 0xf000 6108; GCN-NEXT: s_mov_b32 s2, -1 6109; GCN-NEXT: s_waitcnt lgkmcnt(0) 6110; GCN-NEXT: s_mov_b32 s0, s4 6111; GCN-NEXT: s_ashr_i32 s4, s7, 31 6112; GCN-NEXT: s_lshr_b32 s4, s4, 20 6113; GCN-NEXT: s_add_u32 s4, s6, s4 6114; GCN-NEXT: s_mov_b32 s1, s5 6115; GCN-NEXT: s_addc_u32 s5, s7, 0 6116; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 6117; GCN-NEXT: v_mov_b32_e32 v0, s4 6118; GCN-NEXT: v_mov_b32_e32 v1, s5 6119; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6120; GCN-NEXT: s_endpgm 6121 %r = sdiv i64 %x, 4096 6122 store i64 %r, 
i64 addrspace(1)* %out 6123 ret void 6124} 6125 6126define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 6127; CHECK-LABEL: @sdiv_i64_pow2_shl_denom( 6128; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 6129; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]] 6130; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] 6131; CHECK-NEXT: ret void 6132; 6133; GCN-LABEL: sdiv_i64_pow2_shl_denom: 6134; GCN: ; %bb.0: 6135; GCN-NEXT: s_load_dword s4, s[0:1], 0xd 6136; GCN-NEXT: s_mov_b32 s3, 0 6137; GCN-NEXT: s_movk_i32 s2, 0x1000 6138; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 6139; GCN-NEXT: s_mov_b32 s7, 0xf000 6140; GCN-NEXT: s_waitcnt lgkmcnt(0) 6141; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 6142; GCN-NEXT: s_ashr_i32 s12, s3, 31 6143; GCN-NEXT: s_add_u32 s2, s2, s12 6144; GCN-NEXT: s_mov_b32 s13, s12 6145; GCN-NEXT: s_addc_u32 s3, s3, s12 6146; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] 6147; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 6148; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 6149; GCN-NEXT: s_sub_u32 s4, 0, s2 6150; GCN-NEXT: s_subb_u32 s5, 0, s3 6151; GCN-NEXT: s_ashr_i32 s14, s11, 31 6152; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 6153; GCN-NEXT: v_rcp_f32_e32 v0, v0 6154; GCN-NEXT: s_mov_b32 s15, s14 6155; GCN-NEXT: s_mov_b32 s6, -1 6156; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 6157; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 6158; GCN-NEXT: v_trunc_f32_e32 v1, v1 6159; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 6160; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6161; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6162; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 6163; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 6164; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 6165; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 6166; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6167; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 6168; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 6169; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 6170; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 6171; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 6172; 
GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6173; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 6174; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 6175; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 6176; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 6177; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 6178; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 6179; GCN-NEXT: v_mov_b32_e32 v4, 0 6180; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 6181; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6182; GCN-NEXT: v_mov_b32_e32 v6, 0 6183; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 6184; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 6185; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 6186; GCN-NEXT: v_mul_lo_u32 v5, s4, v2 6187; GCN-NEXT: v_mul_hi_u32 v7, s4, v0 6188; GCN-NEXT: v_mul_lo_u32 v8, s5, v0 6189; GCN-NEXT: s_mov_b32 s5, s9 6190; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6191; GCN-NEXT: v_mul_lo_u32 v7, s4, v0 6192; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 6193; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 6194; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 6195; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 6196; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 6197; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 6198; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 6199; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6200; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 6201; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 6202; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 6203; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 6204; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 6205; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6206; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 6207; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6208; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 6209; GCN-NEXT: s_add_u32 s0, s10, s14 6210; GCN-NEXT: s_addc_u32 s1, s11, s14 6211; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6212; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 6213; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6214; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 6215; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 6216; GCN-NEXT: v_mul_hi_u32 v5, 
s10, v1 6217; GCN-NEXT: v_mul_hi_u32 v7, s11, v1 6218; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 6219; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6220; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6221; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 6222; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 6223; GCN-NEXT: s_mov_b32 s4, s8 6224; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6225; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6226; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 6227; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6228; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 6229; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 6230; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 6231; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 6232; GCN-NEXT: v_mov_b32_e32 v5, s3 6233; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6234; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 6235; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 6236; GCN-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 6237; GCN-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 6238; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 6239; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 6240; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 6241; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 6242; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 6243; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 6244; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 6245; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 6246; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 6247; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 6248; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 6249; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 6250; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 6251; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 6252; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 6253; GCN-NEXT: v_mov_b32_e32 v6, s11 6254; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 6255; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 6256; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 6257; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 6258; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 6259; 
GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 6260; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 6261; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 6262; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 6263; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6264; GCN-NEXT: s_xor_b64 s[0:1], s[14:15], s[12:13] 6265; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 6266; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 6267; GCN-NEXT: v_xor_b32_e32 v1, s1, v1 6268; GCN-NEXT: v_mov_b32_e32 v2, s1 6269; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 6270; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 6271; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6272; GCN-NEXT: s_endpgm 6273 %shl.y = shl i64 4096, %y 6274 %r = sdiv i64 %x, %shl.y 6275 store i64 %r, i64 addrspace(1)* %out 6276 ret void 6277} 6278 6279define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 6280; CHECK-LABEL: @sdiv_v2i64_pow2k_denom( 6281; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 6282; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 6283; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 6284; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 6285; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096 6286; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 6287; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] 6288; CHECK-NEXT: ret void 6289; 6290; GCN-LABEL: sdiv_v2i64_pow2k_denom: 6291; GCN: ; %bb.0: 6292; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6293; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 6294; GCN-NEXT: s_mov_b32 s7, 0xf000 6295; GCN-NEXT: s_mov_b32 s6, -1 6296; GCN-NEXT: s_waitcnt lgkmcnt(0) 6297; GCN-NEXT: s_ashr_i32 s8, s1, 31 6298; GCN-NEXT: s_lshr_b32 s8, s8, 20 6299; GCN-NEXT: s_add_u32 s0, s0, s8 6300; GCN-NEXT: s_addc_u32 s1, s1, 0 6301; GCN-NEXT: s_ashr_i32 s8, s3, 31 6302; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 6303; GCN-NEXT: s_lshr_b32 s8, s8, 
20 6304; GCN-NEXT: s_add_u32 s2, s2, s8 6305; GCN-NEXT: s_addc_u32 s3, s3, 0 6306; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 6307; GCN-NEXT: v_mov_b32_e32 v0, s0 6308; GCN-NEXT: v_mov_b32_e32 v1, s1 6309; GCN-NEXT: v_mov_b32_e32 v2, s2 6310; GCN-NEXT: v_mov_b32_e32 v3, s3 6311; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 6312; GCN-NEXT: s_endpgm 6313 %r = sdiv <2 x i64> %x, <i64 4096, i64 4096> 6314 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 6315 ret void 6316} 6317 6318define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 6319; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom( 6320; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 6321; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 6322; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 6323; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 6324; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095 6325; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 6326; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] 6327; CHECK-NEXT: ret void 6328; 6329; GCN-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 6330; GCN: ; %bb.0: 6331; GCN-NEXT: v_mov_b32_e32 v0, 0x457ff000 6332; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 6333; GCN-NEXT: v_mac_f32_e32 v0, 0, v1 6334; GCN-NEXT: v_rcp_f32_e32 v0, v0 6335; GCN-NEXT: s_movk_i32 s6, 0xf001 6336; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6337; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 6338; GCN-NEXT: s_mov_b32 s7, 0xf000 6339; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 6340; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 6341; GCN-NEXT: v_trunc_f32_e32 v1, v1 6342; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 6343; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6344; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6345; GCN-NEXT: s_waitcnt lgkmcnt(0) 6346; GCN-NEXT: s_ashr_i32 s0, s9, 31 6347; GCN-NEXT: s_lshr_b32 s0, s0, 20 6348; GCN-NEXT: 
v_mul_hi_u32 v2, s6, v0 6349; GCN-NEXT: v_mul_lo_u32 v3, v1, s6 6350; GCN-NEXT: s_add_u32 s2, s8, s0 6351; GCN-NEXT: s_addc_u32 s3, s9, 0 6352; GCN-NEXT: s_ashr_i32 s8, s11, 31 6353; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 6354; GCN-NEXT: v_mul_lo_u32 v3, v0, s6 6355; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 6356; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 6357; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 6358; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 6359; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 6360; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6361; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 6362; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 6363; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 6364; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 6365; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 6366; GCN-NEXT: s_mov_b32 s9, s8 6367; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 6368; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 6369; GCN-NEXT: v_mov_b32_e32 v4, 0 6370; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 6371; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6372; GCN-NEXT: v_mov_b32_e32 v6, 0 6373; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 6374; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 6375; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 6376; GCN-NEXT: v_mul_lo_u32 v5, v2, s6 6377; GCN-NEXT: v_mul_hi_u32 v7, s6, v0 6378; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6379; GCN-NEXT: v_mul_lo_u32 v7, v0, s6 6380; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 6381; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 6382; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 6383; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 6384; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 6385; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 6386; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 6387; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6388; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 6389; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 6390; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 6391; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 6392; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 6393; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6394; 
GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 6395; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6396; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 6397; GCN-NEXT: s_add_u32 s0, s10, s8 6398; GCN-NEXT: s_addc_u32 s1, s11, s8 6399; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6400; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] 6401; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6402; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 6403; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 6404; GCN-NEXT: v_mul_hi_u32 v5, s0, v1 6405; GCN-NEXT: v_mul_hi_u32 v7, s1, v1 6406; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 6407; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6408; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6409; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 6410; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 6411; GCN-NEXT: s_movk_i32 s9, 0xfff 6412; GCN-NEXT: s_mov_b32 s6, -1 6413; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6414; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6415; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 6416; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6417; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 6418; GCN-NEXT: v_mul_lo_u32 v2, v1, s9 6419; GCN-NEXT: v_mul_hi_u32 v3, s9, v0 6420; GCN-NEXT: v_mul_lo_u32 v4, v0, s9 6421; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6422; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 6423; GCN-NEXT: v_mov_b32_e32 v3, s1 6424; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 6425; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v4 6426; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 6427; GCN-NEXT: s_movk_i32 s0, 0xffe 6428; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 6429; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 6430; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 6431; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 6432; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 6433; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 6434; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 6435; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 6436; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 6437; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] 6438; 
GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 6439; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 6440; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] 6441; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 6442; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc 6443; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 6444; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc 6445; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 6446; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 6447; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 6448; GCN-NEXT: v_xor_b32_e32 v1, s8, v1 6449; GCN-NEXT: v_mov_b32_e32 v3, s8 6450; GCN-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc 6451; GCN-NEXT: v_mov_b32_e32 v0, s2 6452; GCN-NEXT: v_mov_b32_e32 v1, s3 6453; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 6454; GCN-NEXT: s_endpgm 6455 %r = sdiv <2 x i64> %x, <i64 4096, i64 4095> 6456 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 6457 ret void 6458} 6459 6460define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 6461; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom( 6462; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 6463; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 6464; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 6465; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]] 6466; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 6467; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 6468; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 6469; CHECK-NEXT: [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]] 6470; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 6471; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] 6472; CHECK-NEXT: ret void 6473; 6474; GCN-LABEL: sdiv_v2i64_pow2_shl_denom: 6475; GCN: ; %bb.0: 6476; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 6477; GCN-NEXT: s_mov_b32 s3, 0 6478; 
GCN-NEXT: s_movk_i32 s2, 0x1000 6479; GCN-NEXT: s_mov_b32 s18, 0x4f800000 6480; GCN-NEXT: s_mov_b32 s19, 0x5f7ffffc 6481; GCN-NEXT: s_waitcnt lgkmcnt(0) 6482; GCN-NEXT: s_lshl_b64 s[12:13], s[2:3], s6 6483; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 6484; GCN-NEXT: s_ashr_i32 s16, s3, 31 6485; GCN-NEXT: s_add_u32 s2, s2, s16 6486; GCN-NEXT: s_mov_b32 s17, s16 6487; GCN-NEXT: s_addc_u32 s3, s3, s16 6488; GCN-NEXT: s_xor_b64 s[14:15], s[2:3], s[16:17] 6489; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14 6490; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15 6491; GCN-NEXT: s_mov_b32 s20, 0x2f800000 6492; GCN-NEXT: s_mov_b32 s21, 0xcf800000 6493; GCN-NEXT: s_sub_u32 s6, 0, s14 6494; GCN-NEXT: v_mac_f32_e32 v0, s18, v1 6495; GCN-NEXT: v_rcp_f32_e32 v0, v0 6496; GCN-NEXT: s_subb_u32 s7, 0, s15 6497; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6498; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 6499; GCN-NEXT: v_mul_f32_e32 v0, s19, v0 6500; GCN-NEXT: v_mul_f32_e32 v1, s20, v0 6501; GCN-NEXT: v_trunc_f32_e32 v1, v1 6502; GCN-NEXT: v_mac_f32_e32 v0, s21, v1 6503; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6504; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6505; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 6506; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 6507; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 6508; GCN-NEXT: v_mul_lo_u32 v5, s6, v0 6509; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6510; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 6511; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 6512; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 6513; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 6514; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 6515; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6516; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 6517; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 6518; GCN-NEXT: v_mul_lo_u32 v6, v1, v5 6519; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 6520; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 6521; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 6522; GCN-NEXT: v_mov_b32_e32 v4, 0 6523; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 6524; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6525; GCN-NEXT: 
v_mov_b32_e32 v6, 0 6526; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 6527; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 6528; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] 6529; GCN-NEXT: v_mul_lo_u32 v5, s6, v2 6530; GCN-NEXT: v_mul_hi_u32 v7, s6, v0 6531; GCN-NEXT: v_mul_lo_u32 v8, s7, v0 6532; GCN-NEXT: s_mov_b32 s7, 0xf000 6533; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6534; GCN-NEXT: v_mul_lo_u32 v7, s6, v0 6535; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 6536; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 6537; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 6538; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 6539; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 6540; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 6541; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 6542; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6543; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 6544; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 6545; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 6546; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 6547; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 6548; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6549; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 6550; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6551; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 6552; GCN-NEXT: s_waitcnt lgkmcnt(0) 6553; GCN-NEXT: s_ashr_i32 s2, s9, 31 6554; GCN-NEXT: s_add_u32 s0, s8, s2 6555; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6556; GCN-NEXT: s_mov_b32 s3, s2 6557; GCN-NEXT: s_addc_u32 s1, s9, s2 6558; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] 6559; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6560; GCN-NEXT: v_mul_lo_u32 v2, s8, v1 6561; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 6562; GCN-NEXT: v_mul_hi_u32 v5, s8, v1 6563; GCN-NEXT: v_mul_hi_u32 v7, s9, v1 6564; GCN-NEXT: v_mul_lo_u32 v1, s9, v1 6565; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6566; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6567; GCN-NEXT: v_mul_lo_u32 v5, s9, v0 6568; GCN-NEXT: v_mul_hi_u32 v0, s9, v0 6569; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] 6570; GCN-NEXT: s_mov_b32 s6, -1 6571; 
GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6572; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6573; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 6574; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6575; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 6576; GCN-NEXT: v_mul_lo_u32 v2, s14, v1 6577; GCN-NEXT: v_mul_hi_u32 v3, s14, v0 6578; GCN-NEXT: v_mul_lo_u32 v5, s15, v0 6579; GCN-NEXT: v_mov_b32_e32 v7, s15 6580; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6581; GCN-NEXT: v_mul_lo_u32 v3, s14, v0 6582; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 6583; GCN-NEXT: v_sub_i32_e32 v5, vcc, s9, v2 6584; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 6585; GCN-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v7, vcc 6586; GCN-NEXT: v_subrev_i32_e64 v7, s[0:1], s14, v3 6587; GCN-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] 6588; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v5 6589; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 6590; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 6591; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 6592; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v5 6593; GCN-NEXT: v_cndmask_b32_e64 v5, v8, v7, s[0:1] 6594; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 6595; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 6596; GCN-NEXT: v_add_i32_e64 v9, s[0:1], 1, v0 6597; GCN-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1] 6598; GCN-NEXT: s_ashr_i32 s8, s13, 31 6599; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 6600; GCN-NEXT: s_add_u32 s12, s12, s8 6601; GCN-NEXT: v_cndmask_b32_e64 v5, v10, v8, s[0:1] 6602; GCN-NEXT: v_mov_b32_e32 v8, s9 6603; GCN-NEXT: s_mov_b32 s9, s8 6604; GCN-NEXT: s_addc_u32 s13, s13, s8 6605; GCN-NEXT: s_xor_b64 s[12:13], s[12:13], s[8:9] 6606; GCN-NEXT: v_cvt_f32_u32_e32 v10, s12 6607; GCN-NEXT: v_cvt_f32_u32_e32 v11, s13 6608; GCN-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc 6609; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v2 6610; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 6611; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v3 6612; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 6613; GCN-NEXT: 
v_cmp_eq_u32_e32 vcc, s15, v2 6614; GCN-NEXT: v_mac_f32_e32 v10, s18, v11 6615; GCN-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc 6616; GCN-NEXT: v_rcp_f32_e32 v3, v10 6617; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 6618; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 6619; GCN-NEXT: s_sub_u32 s14, 0, s12 6620; GCN-NEXT: v_mul_f32_e32 v3, s19, v3 6621; GCN-NEXT: v_mul_f32_e32 v5, s20, v3 6622; GCN-NEXT: v_trunc_f32_e32 v5, v5 6623; GCN-NEXT: v_mac_f32_e32 v3, s21, v5 6624; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 6625; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 6626; GCN-NEXT: v_cndmask_b32_e64 v2, v9, v7, s[0:1] 6627; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6628; GCN-NEXT: v_mul_hi_u32 v2, s14, v3 6629; GCN-NEXT: v_mul_lo_u32 v7, s14, v5 6630; GCN-NEXT: s_subb_u32 s15, 0, s13 6631; GCN-NEXT: v_mul_lo_u32 v8, s15, v3 6632; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 6633; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v7 6634; GCN-NEXT: v_mul_lo_u32 v7, s14, v3 6635; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v8 6636; GCN-NEXT: v_mul_lo_u32 v8, v3, v2 6637; GCN-NEXT: v_mul_hi_u32 v10, v3, v2 6638; GCN-NEXT: v_mul_hi_u32 v9, v3, v7 6639; GCN-NEXT: v_mul_hi_u32 v11, v5, v2 6640; GCN-NEXT: v_mul_lo_u32 v2, v5, v2 6641; GCN-NEXT: v_xor_b32_e32 v1, s3, v1 6642; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 6643; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 6644; GCN-NEXT: v_mul_lo_u32 v10, v5, v7 6645; GCN-NEXT: v_mul_hi_u32 v7, v5, v7 6646; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 6647; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 6648; GCN-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc 6649; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6650; GCN-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 6651; GCN-NEXT: v_addc_u32_e32 v7, vcc, v6, v8, vcc 6652; GCN-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, s[0:1] 6653; GCN-NEXT: v_mul_lo_u32 v8, s14, v3 6654; GCN-NEXT: v_mul_hi_u32 v9, s14, v2 6655; GCN-NEXT: v_mul_lo_u32 v10, s15, v2 6656; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 6657; GCN-NEXT: v_mul_lo_u32 v9, s14, v2 6658; GCN-NEXT: v_add_i32_e32 
v8, vcc, v10, v8 6659; GCN-NEXT: v_mul_lo_u32 v12, v2, v8 6660; GCN-NEXT: v_mul_hi_u32 v14, v2, v8 6661; GCN-NEXT: v_mul_hi_u32 v13, v2, v9 6662; GCN-NEXT: v_mul_hi_u32 v11, v3, v9 6663; GCN-NEXT: v_mul_lo_u32 v9, v3, v9 6664; GCN-NEXT: v_mul_hi_u32 v10, v3, v8 6665; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 6666; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc 6667; GCN-NEXT: v_mul_lo_u32 v3, v3, v8 6668; GCN-NEXT: v_add_i32_e32 v9, vcc, v12, v9 6669; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc 6670; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc 6671; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3 6672; GCN-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc 6673; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 6674; GCN-NEXT: s_ashr_i32 s14, s11, 31 6675; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] 6676; GCN-NEXT: s_add_u32 s0, s10, s14 6677; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 6678; GCN-NEXT: s_mov_b32 s15, s14 6679; GCN-NEXT: s_addc_u32 s1, s11, s14 6680; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 6681; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6682; GCN-NEXT: v_mul_lo_u32 v5, s10, v3 6683; GCN-NEXT: v_mul_hi_u32 v7, s10, v2 6684; GCN-NEXT: v_mul_hi_u32 v9, s10, v3 6685; GCN-NEXT: v_mul_hi_u32 v10, s11, v3 6686; GCN-NEXT: v_mul_lo_u32 v3, s11, v3 6687; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6688; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc 6689; GCN-NEXT: v_mul_lo_u32 v9, s11, v2 6690; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 6691; GCN-NEXT: v_mov_b32_e32 v8, s3 6692; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 6693; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc 6694; GCN-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc 6695; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 6696; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc 6697; GCN-NEXT: v_mul_lo_u32 v4, s12, v3 6698; GCN-NEXT: v_mul_hi_u32 v5, s12, v2 6699; GCN-NEXT: v_mul_lo_u32 v6, s13, v2 6700; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 6701; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc 6702; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, 
v4 6703; GCN-NEXT: v_mul_lo_u32 v5, s12, v2 6704; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 6705; GCN-NEXT: v_sub_i32_e32 v6, vcc, s11, v4 6706; GCN-NEXT: v_mov_b32_e32 v7, s13 6707; GCN-NEXT: v_sub_i32_e32 v5, vcc, s10, v5 6708; GCN-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc 6709; GCN-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 6710; GCN-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] 6711; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 6712; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 6713; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 6714; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 6715; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 6716; GCN-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 6717; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 6718; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] 6719; GCN-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 6720; GCN-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] 6721; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 6722; GCN-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] 6723; GCN-NEXT: v_mov_b32_e32 v8, s11 6724; GCN-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc 6725; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 6726; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 6727; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 6728; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 6729; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 6730; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc 6731; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 6732; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] 6733; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 6734; GCN-NEXT: s_xor_b64 s[0:1], s[14:15], s[8:9] 6735; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 6736; GCN-NEXT: v_xor_b32_e32 v2, s0, v2 6737; GCN-NEXT: v_xor_b32_e32 v3, s1, v3 6738; GCN-NEXT: v_mov_b32_e32 v4, s1 6739; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 6740; GCN-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 6741; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 6742; GCN-NEXT: s_endpgm 6743 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 6744 %r = 
sdiv <2 x i64> %x, %shl.y 6745 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 6746 ret void 6747} 6748 6749define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 6750; CHECK-LABEL: @srem_i64_oddk_denom( 6751; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 6752; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] 6753; CHECK-NEXT: ret void 6754; 6755; GCN-LABEL: srem_i64_oddk_denom: 6756; GCN: ; %bb.0: 6757; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 6758; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 6759; GCN-NEXT: v_rcp_f32_e32 v0, v0 6760; GCN-NEXT: s_mov_b32 s2, 0xffed2705 6761; GCN-NEXT: v_mov_b32_e32 v8, 0 6762; GCN-NEXT: v_mov_b32_e32 v7, 0 6763; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 6764; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 6765; GCN-NEXT: v_trunc_f32_e32 v1, v1 6766; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 6767; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6768; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6769; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 6770; GCN-NEXT: s_mov_b32 s7, 0xf000 6771; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 6772; GCN-NEXT: v_mul_lo_u32 v2, v1, s2 6773; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 6774; GCN-NEXT: s_mov_b32 s6, -1 6775; GCN-NEXT: s_waitcnt lgkmcnt(0) 6776; GCN-NEXT: s_mov_b32 s4, s8 6777; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6778; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 6779; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 6780; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 6781; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 6782; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 6783; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6784; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 6785; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 6786; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 6787; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 6788; GCN-NEXT: s_mov_b32 s5, s9 6789; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 6790; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 6791; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 6792; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6793; GCN-NEXT: 
v_add_i32_e64 v0, s[0:1], v0, v2 6794; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 6795; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 6796; GCN-NEXT: v_mul_lo_u32 v4, v2, s2 6797; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 6798; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 6799; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 6800; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 6801; GCN-NEXT: v_mul_lo_u32 v10, v0, v4 6802; GCN-NEXT: v_mul_hi_u32 v12, v0, v4 6803; GCN-NEXT: v_mul_hi_u32 v11, v0, v5 6804; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 6805; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 6806; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 6807; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6808; GCN-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc 6809; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 6810; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5 6811; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc 6812; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc 6813; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6814; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 6815; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6816; GCN-NEXT: s_ashr_i32 s2, s11, 31 6817; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 6818; GCN-NEXT: s_add_u32 s0, s10, s2 6819; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6820; GCN-NEXT: s_mov_b32 s3, s2 6821; GCN-NEXT: s_addc_u32 s1, s11, s2 6822; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 6823; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6824; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 6825; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 6826; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 6827; GCN-NEXT: v_mul_hi_u32 v5, s1, v1 6828; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 6829; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6830; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 6831; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 6832; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 6833; GCN-NEXT: s_mov_b32 s3, 0x12d8fb 6834; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 6835; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6836; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 6837; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 
6838; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 6839; GCN-NEXT: v_mul_hi_u32 v2, s3, v0 6840; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 6841; GCN-NEXT: v_mul_lo_u32 v0, v0, s3 6842; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 6843; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 6844; GCN-NEXT: v_mov_b32_e32 v2, s1 6845; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 6846; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v0 6847; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc 6848; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s3, v2 6849; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc 6850; GCN-NEXT: s_mov_b32 s0, 0x12d8fa 6851; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 6852; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 6853; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6854; GCN-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 6855; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 6856; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 6857; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 6858; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 6859; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 6860; GCN-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] 6861; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 6862; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 6863; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 6864; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 6865; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 6866; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 6867; GCN-NEXT: v_mov_b32_e32 v2, s2 6868; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 6869; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 6870; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6871; GCN-NEXT: s_endpgm 6872 %r = srem i64 %x, 1235195 6873 store i64 %r, i64 addrspace(1)* %out 6874 ret void 6875} 6876 6877define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 6878; CHECK-LABEL: @srem_i64_pow2k_denom( 6879; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 6880; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] 6881; CHECK-NEXT: ret void 6882; 6883; GCN-LABEL: 
srem_i64_pow2k_denom: 6884; GCN: ; %bb.0: 6885; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 6886; GCN-NEXT: s_mov_b32 s3, 0xf000 6887; GCN-NEXT: s_mov_b32 s2, -1 6888; GCN-NEXT: s_waitcnt lgkmcnt(0) 6889; GCN-NEXT: s_mov_b32 s0, s4 6890; GCN-NEXT: s_ashr_i32 s4, s7, 31 6891; GCN-NEXT: s_lshr_b32 s4, s4, 20 6892; GCN-NEXT: s_add_u32 s4, s6, s4 6893; GCN-NEXT: s_mov_b32 s1, s5 6894; GCN-NEXT: s_addc_u32 s5, s7, 0 6895; GCN-NEXT: s_and_b32 s4, s4, 0xfffff000 6896; GCN-NEXT: s_sub_u32 s4, s6, s4 6897; GCN-NEXT: s_subb_u32 s5, s7, s5 6898; GCN-NEXT: v_mov_b32_e32 v0, s4 6899; GCN-NEXT: v_mov_b32_e32 v1, s5 6900; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6901; GCN-NEXT: s_endpgm 6902 %r = srem i64 %x, 4096 6903 store i64 %r, i64 addrspace(1)* %out 6904 ret void 6905} 6906 6907define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 6908; CHECK-LABEL: @srem_i64_pow2_shl_denom( 6909; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 6910; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] 6911; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]] 6912; CHECK-NEXT: ret void 6913; 6914; GCN-LABEL: srem_i64_pow2_shl_denom: 6915; GCN: ; %bb.0: 6916; GCN-NEXT: s_load_dword s4, s[0:1], 0xd 6917; GCN-NEXT: s_mov_b32 s3, 0 6918; GCN-NEXT: s_movk_i32 s2, 0x1000 6919; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 6920; GCN-NEXT: s_mov_b32 s7, 0xf000 6921; GCN-NEXT: s_waitcnt lgkmcnt(0) 6922; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 6923; GCN-NEXT: s_ashr_i32 s4, s3, 31 6924; GCN-NEXT: s_add_u32 s2, s2, s4 6925; GCN-NEXT: s_mov_b32 s5, s4 6926; GCN-NEXT: s_addc_u32 s3, s3, s4 6927; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] 6928; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 6929; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 6930; GCN-NEXT: s_sub_u32 s2, 0, s12 6931; GCN-NEXT: s_subb_u32 s3, 0, s13 6932; GCN-NEXT: s_ashr_i32 s14, s11, 31 6933; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 6934; GCN-NEXT: v_rcp_f32_e32 v0, v0 6935; GCN-NEXT: 
s_mov_b32 s15, s14 6936; GCN-NEXT: s_mov_b32 s6, -1 6937; GCN-NEXT: s_mov_b32 s4, s8 6938; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 6939; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 6940; GCN-NEXT: v_trunc_f32_e32 v1, v1 6941; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 6942; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6943; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6944; GCN-NEXT: s_mov_b32 s5, s9 6945; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 6946; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 6947; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 6948; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 6949; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6950; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 6951; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 6952; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 6953; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 6954; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 6955; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6956; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 6957; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 6958; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 6959; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 6960; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 6961; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 6962; GCN-NEXT: v_mov_b32_e32 v4, 0 6963; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 6964; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6965; GCN-NEXT: v_mov_b32_e32 v6, 0 6966; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 6967; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 6968; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 6969; GCN-NEXT: v_mul_lo_u32 v5, s2, v2 6970; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 6971; GCN-NEXT: v_mul_lo_u32 v8, s3, v0 6972; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6973; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 6974; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 6975; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 6976; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 6977; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 6978; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 6979; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 6980; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 6981; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6982; GCN-NEXT: 
v_addc_u32_e32 v11, vcc, 0, v12, vcc 6983; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 6984; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 6985; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 6986; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 6987; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6988; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 6989; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6990; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 6991; GCN-NEXT: s_add_u32 s0, s10, s14 6992; GCN-NEXT: s_addc_u32 s1, s11, s14 6993; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6994; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 6995; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6996; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 6997; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 6998; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 6999; GCN-NEXT: v_mul_hi_u32 v7, s11, v1 7000; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 7001; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7002; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 7003; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 7004; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 7005; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 7006; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 7007; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 7008; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7009; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 7010; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 7011; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 7012; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 7013; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 7014; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 7015; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 7016; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 7017; GCN-NEXT: v_mov_b32_e32 v3, s13 7018; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 7019; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 7020; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 7021; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 7022; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 7023; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 7024; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, 
s[2:3] 7025; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 7026; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 7027; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 7028; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 7029; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 7030; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 7031; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 7032; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 7033; GCN-NEXT: v_mov_b32_e32 v5, s11 7034; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 7035; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 7036; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 7037; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 7038; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 7039; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 7040; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 7041; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 7042; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7043; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 7044; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7045; GCN-NEXT: v_xor_b32_e32 v0, s14, v0 7046; GCN-NEXT: v_xor_b32_e32 v1, s14, v1 7047; GCN-NEXT: v_mov_b32_e32 v2, s14 7048; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0 7049; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 7050; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7051; GCN-NEXT: s_endpgm 7052 %shl.y = shl i64 4096, %y 7053 %r = srem i64 %x, %shl.y 7054 store i64 %r, i64 addrspace(1)* %out 7055 ret void 7056} 7057 7058define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 7059; CHECK-LABEL: @srem_v2i64_pow2k_denom( 7060; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7061; CHECK-NEXT: [[TMP2:%.*]] = srem i64 [[TMP1]], 4096 7062; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 7063; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 7064; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096 7065; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 
[[TMP5]], i64 1 7066; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]] 7067; CHECK-NEXT: ret void 7068; 7069; GCN-LABEL: srem_v2i64_pow2k_denom: 7070; GCN: ; %bb.0: 7071; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7072; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 7073; GCN-NEXT: s_movk_i32 s8, 0xf000 7074; GCN-NEXT: s_mov_b32 s7, 0xf000 7075; GCN-NEXT: s_mov_b32 s6, -1 7076; GCN-NEXT: s_waitcnt lgkmcnt(0) 7077; GCN-NEXT: s_ashr_i32 s9, s1, 31 7078; GCN-NEXT: s_lshr_b32 s9, s9, 20 7079; GCN-NEXT: s_add_u32 s9, s0, s9 7080; GCN-NEXT: s_addc_u32 s10, s1, 0 7081; GCN-NEXT: s_and_b32 s9, s9, s8 7082; GCN-NEXT: s_sub_u32 s0, s0, s9 7083; GCN-NEXT: s_subb_u32 s1, s1, s10 7084; GCN-NEXT: s_ashr_i32 s9, s3, 31 7085; GCN-NEXT: s_lshr_b32 s9, s9, 20 7086; GCN-NEXT: s_add_u32 s9, s2, s9 7087; GCN-NEXT: s_addc_u32 s10, s3, 0 7088; GCN-NEXT: s_and_b32 s8, s9, s8 7089; GCN-NEXT: s_sub_u32 s2, s2, s8 7090; GCN-NEXT: s_subb_u32 s3, s3, s10 7091; GCN-NEXT: v_mov_b32_e32 v0, s0 7092; GCN-NEXT: v_mov_b32_e32 v1, s1 7093; GCN-NEXT: v_mov_b32_e32 v2, s2 7094; GCN-NEXT: v_mov_b32_e32 v3, s3 7095; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 7096; GCN-NEXT: s_endpgm 7097 %r = srem <2 x i64> %x, <i64 4096, i64 4096> 7098 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 7099 ret void 7100} 7101 7102define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 7103; CHECK-LABEL: @srem_v2i64_pow2_shl_denom( 7104; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 7105; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7106; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 7107; CHECK-NEXT: [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]] 7108; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 7109; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 7110; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> 
[[SHL_Y]], i64 1 7111; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]] 7112; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 7113; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]] 7114; CHECK-NEXT: ret void 7115; 7116; GCN-LABEL: srem_v2i64_pow2_shl_denom: 7117; GCN: ; %bb.0: 7118; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 7119; GCN-NEXT: s_mov_b32 s3, 0 7120; GCN-NEXT: s_movk_i32 s2, 0x1000 7121; GCN-NEXT: s_mov_b32 s18, 0x4f800000 7122; GCN-NEXT: s_mov_b32 s19, 0x5f7ffffc 7123; GCN-NEXT: s_waitcnt lgkmcnt(0) 7124; GCN-NEXT: s_lshl_b64 s[14:15], s[2:3], s6 7125; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 7126; GCN-NEXT: s_ashr_i32 s4, s3, 31 7127; GCN-NEXT: s_add_u32 s2, s2, s4 7128; GCN-NEXT: s_mov_b32 s5, s4 7129; GCN-NEXT: s_addc_u32 s3, s3, s4 7130; GCN-NEXT: s_xor_b64 s[16:17], s[2:3], s[4:5] 7131; GCN-NEXT: v_cvt_f32_u32_e32 v0, s16 7132; GCN-NEXT: v_cvt_f32_u32_e32 v1, s17 7133; GCN-NEXT: s_mov_b32 s20, 0x2f800000 7134; GCN-NEXT: s_mov_b32 s21, 0xcf800000 7135; GCN-NEXT: s_sub_u32 s6, 0, s16 7136; GCN-NEXT: v_mac_f32_e32 v0, s18, v1 7137; GCN-NEXT: v_rcp_f32_e32 v0, v0 7138; GCN-NEXT: s_subb_u32 s7, 0, s17 7139; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7140; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 7141; GCN-NEXT: v_mul_f32_e32 v0, s19, v0 7142; GCN-NEXT: v_mul_f32_e32 v1, s20, v0 7143; GCN-NEXT: v_trunc_f32_e32 v1, v1 7144; GCN-NEXT: v_mac_f32_e32 v0, s21, v1 7145; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 7146; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 7147; GCN-NEXT: s_waitcnt lgkmcnt(0) 7148; GCN-NEXT: s_ashr_i32 s12, s9, 31 7149; GCN-NEXT: s_add_u32 s0, s8, s12 7150; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 7151; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 7152; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 7153; GCN-NEXT: v_mul_lo_u32 v5, s6, v0 7154; GCN-NEXT: s_mov_b32 s13, s12 7155; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7156; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7157; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 7158; 
GCN-NEXT: v_mul_hi_u32 v4, v0, v5 7159; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 7160; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 7161; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 7162; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 7163; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 7164; GCN-NEXT: v_mul_lo_u32 v6, v1, v5 7165; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 7166; GCN-NEXT: s_addc_u32 s1, s9, s12 7167; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] 7168; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 7169; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 7170; GCN-NEXT: v_mov_b32_e32 v4, 0 7171; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 7172; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7173; GCN-NEXT: v_mov_b32_e32 v6, 0 7174; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 7175; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 7176; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] 7177; GCN-NEXT: v_mul_lo_u32 v5, s6, v2 7178; GCN-NEXT: v_mul_hi_u32 v7, s6, v0 7179; GCN-NEXT: v_mul_lo_u32 v8, s7, v0 7180; GCN-NEXT: s_mov_b32 s7, 0xf000 7181; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 7182; GCN-NEXT: v_mul_lo_u32 v7, s6, v0 7183; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 7184; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 7185; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 7186; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 7187; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 7188; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 7189; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 7190; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 7191; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 7192; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 7193; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 7194; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 7195; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 7196; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 7197; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 7198; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 7199; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 7200; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7201; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7202; GCN-NEXT: v_mul_lo_u32 v2, 
s8, v1 7203; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 7204; GCN-NEXT: v_mul_hi_u32 v5, s8, v1 7205; GCN-NEXT: v_mul_hi_u32 v7, s9, v1 7206; GCN-NEXT: v_mul_lo_u32 v1, s9, v1 7207; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7208; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 7209; GCN-NEXT: v_mul_lo_u32 v5, s9, v0 7210; GCN-NEXT: v_mul_hi_u32 v0, s9, v0 7211; GCN-NEXT: s_mov_b32 s6, -1 7212; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 7213; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 7214; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 7215; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7216; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 7217; GCN-NEXT: v_mul_lo_u32 v1, s16, v1 7218; GCN-NEXT: v_mul_hi_u32 v2, s16, v0 7219; GCN-NEXT: v_mul_lo_u32 v3, s17, v0 7220; GCN-NEXT: v_mul_lo_u32 v0, s16, v0 7221; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 7222; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 7223; GCN-NEXT: v_sub_i32_e32 v2, vcc, s9, v1 7224; GCN-NEXT: v_mov_b32_e32 v3, s17 7225; GCN-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 7226; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 7227; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v0 7228; GCN-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1] 7229; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v7 7230; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 7231; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 7232; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v5 7233; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v5 7234; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 7235; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v7 7236; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 7237; GCN-NEXT: s_ashr_i32 s2, s15, 31 7238; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 7239; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 7240; GCN-NEXT: s_add_u32 s8, s14, s2 7241; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] 7242; GCN-NEXT: v_mov_b32_e32 v7, s9 7243; GCN-NEXT: s_mov_b32 s3, s2 7244; GCN-NEXT: s_addc_u32 s9, s15, s2 7245; GCN-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] 7246; 
GCN-NEXT: v_cvt_f32_u32_e32 v8, s8 7247; GCN-NEXT: v_cvt_f32_u32_e32 v9, s9 7248; GCN-NEXT: v_subb_u32_e32 v1, vcc, v7, v1, vcc 7249; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 7250; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 7251; GCN-NEXT: v_mac_f32_e32 v8, s18, v9 7252; GCN-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 7253; GCN-NEXT: v_rcp_f32_e32 v8, v8 7254; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc 7255; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1 7256; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc 7257; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 7258; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7259; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[0:1] 7260; GCN-NEXT: v_mul_f32_e32 v3, s19, v8 7261; GCN-NEXT: v_mul_f32_e32 v5, s20, v3 7262; GCN-NEXT: v_trunc_f32_e32 v5, v5 7263; GCN-NEXT: v_mac_f32_e32 v3, s21, v5 7264; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 7265; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 7266; GCN-NEXT: s_sub_u32 s2, 0, s8 7267; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7268; GCN-NEXT: v_mul_hi_u32 v2, s2, v3 7269; GCN-NEXT: v_mul_lo_u32 v7, s2, v5 7270; GCN-NEXT: s_subb_u32 s3, 0, s9 7271; GCN-NEXT: v_mul_lo_u32 v8, s3, v3 7272; GCN-NEXT: s_ashr_i32 s14, s11, 31 7273; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v7 7274; GCN-NEXT: v_mul_lo_u32 v7, s2, v3 7275; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v8 7276; GCN-NEXT: v_mul_lo_u32 v8, v3, v2 7277; GCN-NEXT: v_mul_hi_u32 v10, v3, v2 7278; GCN-NEXT: v_mul_hi_u32 v9, v3, v7 7279; GCN-NEXT: v_mul_hi_u32 v11, v5, v2 7280; GCN-NEXT: v_mul_lo_u32 v2, v5, v2 7281; GCN-NEXT: s_mov_b32 s15, s14 7282; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 7283; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 7284; GCN-NEXT: v_mul_lo_u32 v10, v5, v7 7285; GCN-NEXT: v_mul_hi_u32 v7, v5, v7 7286; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 7287; GCN-NEXT: v_xor_b32_e32 v1, s12, v1 7288; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 7289; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 7290; GCN-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc 7291; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, 
v2 7292; GCN-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 7293; GCN-NEXT: v_addc_u32_e32 v7, vcc, v6, v8, vcc 7294; GCN-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, s[0:1] 7295; GCN-NEXT: v_mul_lo_u32 v8, s2, v3 7296; GCN-NEXT: v_mul_hi_u32 v9, s2, v2 7297; GCN-NEXT: v_mul_lo_u32 v10, s3, v2 7298; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 7299; GCN-NEXT: v_mul_lo_u32 v9, s2, v2 7300; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 7301; GCN-NEXT: v_mul_lo_u32 v12, v2, v8 7302; GCN-NEXT: v_mul_hi_u32 v14, v2, v8 7303; GCN-NEXT: v_mul_hi_u32 v13, v2, v9 7304; GCN-NEXT: v_mul_hi_u32 v11, v3, v9 7305; GCN-NEXT: v_mul_lo_u32 v9, v3, v9 7306; GCN-NEXT: v_mul_hi_u32 v10, v3, v8 7307; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 7308; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc 7309; GCN-NEXT: v_mul_lo_u32 v3, v3, v8 7310; GCN-NEXT: v_add_i32_e32 v9, vcc, v12, v9 7311; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc 7312; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc 7313; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3 7314; GCN-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc 7315; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 7316; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] 7317; GCN-NEXT: s_add_u32 s0, s10, s14 7318; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 7319; GCN-NEXT: s_addc_u32 s1, s11, s14 7320; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 7321; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 7322; GCN-NEXT: v_mul_lo_u32 v5, s10, v3 7323; GCN-NEXT: v_mul_hi_u32 v7, s10, v2 7324; GCN-NEXT: v_mul_hi_u32 v9, s10, v3 7325; GCN-NEXT: v_mul_hi_u32 v10, s11, v3 7326; GCN-NEXT: v_mul_lo_u32 v3, s11, v3 7327; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 7328; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc 7329; GCN-NEXT: v_mul_lo_u32 v9, s11, v2 7330; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 7331; GCN-NEXT: v_mov_b32_e32 v8, s12 7332; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 7333; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc 7334; GCN-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc 7335; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, 
v3 7336; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc 7337; GCN-NEXT: v_mul_lo_u32 v3, s8, v3 7338; GCN-NEXT: v_mul_hi_u32 v4, s8, v2 7339; GCN-NEXT: v_mul_lo_u32 v5, s9, v2 7340; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 7341; GCN-NEXT: v_mul_lo_u32 v2, s8, v2 7342; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc 7343; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 7344; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 7345; GCN-NEXT: v_sub_i32_e32 v4, vcc, s11, v3 7346; GCN-NEXT: v_mov_b32_e32 v5, s9 7347; GCN-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 7348; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 7349; GCN-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v2 7350; GCN-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] 7351; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7 7352; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] 7353; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 7354; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v6 7355; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v6 7356; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 7357; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v7 7358; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 7359; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 7360; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 7361; GCN-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] 7362; GCN-NEXT: v_mov_b32_e32 v7, s11 7363; GCN-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc 7364; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 7365; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 7366; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 7367; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 7368; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 7369; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 7370; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 7371; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 7372; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 7373; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 7374; GCN-NEXT: v_xor_b32_e32 v2, s14, v2 7375; GCN-NEXT: v_xor_b32_e32 v3, s14, v3 7376; GCN-NEXT: v_mov_b32_e32 v4, s14 7377; GCN-NEXT: 
v_subrev_i32_e32 v2, vcc, s14, v2 7378; GCN-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 7379; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 7380; GCN-NEXT: s_endpgm 7381 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 7382 %r = srem <2 x i64> %x, %shl.y 7383 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 7384 ret void 7385} 7386