; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s

; Checks the code generated for the r600 (redwood) and amdgcn (tahiti, tonga)
; targets when an unsigned division (udiv) and an unsigned remainder (urem) of
; the same operands appear in one kernel, for i32, <2 x i32> and <4 x i32>.
;
; The check lines in this file are generated. Regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand; any change
; to the IR or to the llc invocations above invalidates them.

; Scalar case: the quotient of %x and %y is stored through %out0 and the
; remainder through %out1, so the checked output has two stores per target.
; The unnamed [8 x i32] parameters sit between the named kernel arguments
; (presumably to spread out their kernel-argument offsets; confirm).
define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1, [8 x i32], i32 %x, [8 x i32], i32 %y) {
; R600-LABEL: test_udivrem:
; R600:       ; %bb.0:
; R600-NEXT:    ALU 21, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    SUB_INT T0.W, 0.0, KC0[9].X,
; R600-NEXT:    RECIP_UINT * T0.X, KC0[9].X,
; R600-NEXT:    MULLO_INT * T0.Y, PV.W, PS,
; R600-NEXT:    MULHI * T0.Y, T0.X, PS,
; R600-NEXT:    ADD_INT * T0.W, T0.X, PS,
; R600-NEXT:    MULHI * T0.X, KC0[6].W, PV.W,
; R600-NEXT:    MULLO_INT * T0.Y, PS, KC0[9].X,
; R600-NEXT:    SUB_INT * T0.W, KC0[6].W, PS,
; R600-NEXT:    SUB_INT T1.W, PV.W, KC0[9].X,
; R600-NEXT:    SETGE_UINT * T2.W, PV.W, KC0[9].X,
; R600-NEXT:    CNDE_INT * T0.W, PS, T0.W, PV.W,
; R600-NEXT:    ADD_INT T0.Z, T0.X, 1,
; R600-NEXT:    SUB_INT T1.W, PV.W, KC0[9].X,
; R600-NEXT:    SETGE_UINT * T3.W, PV.W, KC0[9].X,
; R600-NEXT:    CNDE_INT T1.X, PS, T0.W, PV.W,
; R600-NEXT:    CNDE_INT T0.W, T2.W, T0.X, PV.Z,
; R600-NEXT:    LSHR * T0.X, KC0[4].Z, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT:    ADD_INT * T1.W, PV.W, 1,
; R600-NEXT:    CNDE_INT T2.X, T3.W, T0.W, PV.W,
; R600-NEXT:    LSHR * T3.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX6-LABEL: test_udivrem:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x26
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x13
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s10, s6
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GFX6-NEXT:    s_sub_i32 s3, 0, s2
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
; GFX6-NEXT:    s_load_dword s3, s[0:1], 0x1d
; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s2
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s3, v1
; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v1
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_waitcnt expcnt(0)
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v1, v2, s[0:1]
; GFX6-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: test_udivrem:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x98
; GFX8-NEXT:    s_load_dword s7, s[0:1], 0x74
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x4c
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
; GFX8-NEXT:    s_sub_i32 s2, 0, s6
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT:    v_mul_lo_u32 v1, s2, v0
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT:    v_mul_hi_u32 v2, s7, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s6
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s7, v3
; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v3
; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s6, v3
; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v3
; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s6, v3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
  %result0 = udiv i32 %x, %y
  store i32 %result0, i32 addrspace(1)* %out0
  %result1 = urem i32 %x, %y
  store i32 %result1, i32 addrspace(1)* %out1
  ret void
}

; <2 x i32> case. Both results are stored through the same pointer %out, the
; remainder last, and the checked output has a single store per target.
; TODO(review): it looks like only the remainder survives to codegen, leaving
; the vector quotient unchecked -- confirm whether that is intended. Giving the
; quotient its own output pointer would require regenerating the assertions.
define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
; R600-LABEL: test_udivrem_v2:
; R600:       ; %bb.0:
; R600-NEXT:    ALU 29, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    SUB_INT T0.W, 0.0, KC0[3].Z,
; R600-NEXT:    RECIP_UINT * T0.X, KC0[3].Z,
; R600-NEXT:    MULLO_INT * T0.Y, PV.W, PS,
; R600-NEXT:    SUB_INT T0.W, 0.0, KC0[3].Y,
; R600-NEXT:    RECIP_UINT * T0.Z, KC0[3].Y,
; R600-NEXT:    MULLO_INT * T0.W, PV.W, PS,
; R600-NEXT:    MULHI * T0.W, T0.Z, PS,
; R600-NEXT:    ADD_INT T0.W, T0.Z, PS,
; R600-NEXT:    MULHI * T0.Y, T0.X, T0.Y,
; R600-NEXT:    ADD_INT T1.W, T0.X, PS,
; R600-NEXT:    MULHI * T0.X, KC0[2].W, PV.W,
; R600-NEXT:    MULHI * T0.Y, KC0[3].X, PV.W,
; R600-NEXT:    MULLO_INT * T0.Y, PS, KC0[3].Z,
; R600-NEXT:    SUB_INT T0.W, KC0[3].X, PS,
; R600-NEXT:    MULLO_INT * T0.X, T0.X, KC0[3].Y,
; R600-NEXT:    SUB_INT T0.Z, KC0[2].W, PS,
; R600-NEXT:    SETGE_UINT T1.W, PV.W, KC0[3].Z,
; R600-NEXT:    SUB_INT * T2.W, PV.W, KC0[3].Z,
; R600-NEXT:    CNDE_INT T1.Z, PV.W, T0.W, PS,
; R600-NEXT:    SETGE_UINT T0.W, PV.Z, KC0[3].Y,
; R600-NEXT:    SUB_INT * T1.W, PV.Z, KC0[3].Y,
; R600-NEXT:    CNDE_INT T0.Z, PV.W, T0.Z, PS,
; R600-NEXT:    SETGE_UINT T0.W, PV.Z, KC0[3].Z,
; R600-NEXT:    SUB_INT * T1.W, PV.Z, KC0[3].Z,
; R600-NEXT:    CNDE_INT T0.Y, PV.W, T1.Z, PS,
; R600-NEXT:    SETGE_UINT T0.W, PV.Z, KC0[3].Y,
; R600-NEXT:    SUB_INT * T1.W, PV.Z, KC0[3].Y,
; R600-NEXT:    CNDE_INT T0.X, PV.W, T0.Z, PS,
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX6-LABEL: test_udivrem_v2:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s2, 0x4f7ffffe
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
; GFX6-NEXT:    v_mul_f32_e32 v0, s2, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v1, s2, v1
; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT:    s_sub_i32 s2, 0, s6
; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
; GFX6-NEXT:    s_sub_i32 s2, 0, s7
; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: test_udivrem_v2:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX8-NEXT:    s_mov_b32 s2, 0x4f7ffffe
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s7
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
; GFX8-NEXT:    v_mul_f32_e32 v0, s2, v0
; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT:    v_mul_f32_e32 v1, s2, v1
; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT:    s_sub_i32 s2, 0, s6
; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v0
; GFX8-NEXT:    s_sub_i32 s2, 0, s7
; GFX8-NEXT:    v_mul_lo_u32 v3, s2, v1
; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s6
; GFX8-NEXT:    v_mul_lo_u32 v1, v1, s7
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s7, v1
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s7, v1
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
  %result0 = udiv <2 x i32> %x, %y
  store <2 x i32> %result0, <2 x i32> addrspace(1)* %out
  %result1 = urem <2 x i32> %x, %y
  store <2 x i32> %result1, <2 x i32> addrspace(1)* %out
  ret void
}

; <4 x i32> case. Both results are stored through the same pointer %out, the
; remainder last, and the checked output has a single store per target.
; TODO(review): it looks like only the remainder survives to codegen, leaving
; the vector quotient unchecked -- confirm whether that is intended. Giving the
; quotient its own output pointer would require regenerating the assertions.
define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
; R600-LABEL: test_udivrem_v4:
; R600:       ; %bb.0:
; R600-NEXT:    ALU 57, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    SUB_INT T0.W, 0.0, KC0[5].X,
; R600-NEXT:    RECIP_UINT * T0.X, KC0[5].X,
; R600-NEXT:    MULLO_INT * T0.Y, PV.W, PS,
; R600-NEXT:    SUB_INT T0.W, 0.0, KC0[4].Z,
; R600-NEXT:    RECIP_UINT * T0.Z, KC0[4].Z,
; R600-NEXT:    MULLO_INT * T0.W, PV.W, PS,
; R600-NEXT:    MULHI * T0.W, T0.Z, PS,
; R600-NEXT:    ADD_INT T0.W, T0.Z, PS,
; R600-NEXT:    MULHI * T0.Y, T0.X, T0.Y,
; R600-NEXT:    ADD_INT T1.W, T0.X, PS,
; R600-NEXT:    MULHI * T0.X, KC0[3].Z, PV.W,
; R600-NEXT:    MULHI * T0.Y, KC0[4].X, PV.W,
; R600-NEXT:    MULLO_INT * T0.Y, PS, KC0[5].X,
; R600-NEXT:    RECIP_UINT * T0.Z, KC0[4].Y,
; R600-NEXT:    SUB_INT T0.W, 0.0, KC0[4].W,
; R600-NEXT:    RECIP_UINT * T1.X, KC0[4].W,
; R600-NEXT:    MULLO_INT * T0.W, PV.W, PS,
; R600-NEXT:    SUB_INT T1.W, 0.0, KC0[4].Y,
; R600-NEXT:    MULHI * T0.W, T1.X, PS,
; R600-NEXT:    ADD_INT T0.W, T1.X, PS,
; R600-NEXT:    MULLO_INT * T1.X, PV.W, T0.Z,
; R600-NEXT:    MULHI * T0.W, KC0[3].W, PV.W,
; R600-NEXT:    MULLO_INT * T0.W, PS, KC0[4].W,
; R600-NEXT:    SUB_INT T0.W, KC0[3].W, PS,
; R600-NEXT:    MULHI * T1.X, T0.Z, T1.X,
; R600-NEXT:    SETGE_UINT T1.Y, PV.W, KC0[4].W,
; R600-NEXT:    ADD_INT T0.Z, T0.Z, PS,
; R600-NEXT:    SUB_INT T1.W, KC0[4].X, T0.Y,
; R600-NEXT:    MULLO_INT * T0.X, T0.X, KC0[4].Z,
; R600-NEXT:    SUB_INT T0.Y, KC0[3].Z, PS,
; R600-NEXT:    SETGE_UINT T1.Z, PV.W, KC0[5].X,
; R600-NEXT:    SUB_INT * T2.W, PV.W, KC0[5].X,
; R600-NEXT:    MULHI * T0.X, KC0[3].Y, T0.Z,
; R600-NEXT:    SUB_INT T1.X, T0.W, KC0[4].W,
; R600-NEXT:    CNDE_INT T2.Y, T1.Z, T1.W, T2.W,
; R600-NEXT:    SETGE_UINT T0.Z, T0.Y, KC0[4].Z,
; R600-NEXT:    SUB_INT T1.W, T0.Y, KC0[4].Z,
; R600-NEXT:    MULLO_INT * T0.X, PS, KC0[4].Y,
; R600-NEXT:    CNDE_INT T2.X, PV.Z, T0.Y, PV.W,
; R600-NEXT:    SETGE_UINT T0.Y, PV.Y, KC0[5].X,
; R600-NEXT:    SUB_INT T0.Z, PV.Y, KC0[5].X,
; R600-NEXT:    SUB_INT T1.W, KC0[3].Y, PS,
; R600-NEXT:    CNDE_INT * T0.W, T1.Y, T0.W, PV.X,
; R600-NEXT:    SETGE_UINT T0.X, PS, KC0[4].W,
; R600-NEXT:    SUB_INT T1.Y, PS, KC0[4].W,
; R600-NEXT:    SETGE_UINT T1.Z, PV.W, KC0[4].Y,
; R600-NEXT:    SUB_INT T2.W, PV.W, KC0[4].Y,
; R600-NEXT:    CNDE_INT * T3.W, PV.Y, T2.Y, PV.Z,
; R600-NEXT:    CNDE_INT T0.Y, PV.Z, T1.W, PV.W,
; R600-NEXT:    CNDE_INT T3.Z, PV.X, T0.W, PV.Y, BS:VEC_021/SCL_122
; R600-NEXT:    SETGE_UINT T0.W, T2.X, KC0[4].Z,
; R600-NEXT:    SUB_INT * T1.W, T2.X, KC0[4].Z,
; R600-NEXT:    CNDE_INT T3.Y, PV.W, T2.X, PS,
; R600-NEXT:    SETGE_UINT T0.W, PV.Y, KC0[4].Y,
; R600-NEXT:    SUB_INT * T1.W, PV.Y, KC0[4].Y,
; R600-NEXT:    CNDE_INT T3.X, PV.W, T0.Y, PS,
; R600-NEXT:    LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX6-LABEL: test_udivrem_v4:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
; GFX6-NEXT:    s_mov_b32 s13, 0x4f7ffffe
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
; GFX6-NEXT:    s_sub_i32 s2, 0, s8
; GFX6-NEXT:    s_sub_i32 s12, 0, s9
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s11
; GFX6-NEXT:    v_mul_f32_e32 v0, s13, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v1, s13, v1
; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v1
; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
; GFX6-NEXT:    v_mul_f32_e32 v2, s13, v3
; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
; GFX6-NEXT:    s_sub_i32 s4, 0, s10
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v2
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
; GFX6-NEXT:    s_sub_i32 s4, 0, s11
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
; GFX6-NEXT:    v_mul_f32_e32 v3, s13, v4
; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v3
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s11
; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: test_udivrem_v4:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX8-NEXT:    s_mov_b32 s12, 0x4f7ffffe
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s8
; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s9
; GFX8-NEXT:    s_sub_i32 s2, 0, s8
; GFX8-NEXT:    s_sub_i32 s3, 0, s9
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, s10
; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s11
; GFX8-NEXT:    v_mul_f32_e32 v0, s12, v0
; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT:    v_mul_f32_e32 v1, s12, v1
; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v0
; GFX8-NEXT:    s_sub_i32 s2, 0, s10
; GFX8-NEXT:    v_mul_lo_u32 v4, s3, v1
; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v4, v1
; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s8
; GFX8-NEXT:    v_mul_f32_e32 v2, s12, v3
; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
; GFX8-NEXT:    v_mul_lo_u32 v1, v1, s9
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s8, v0
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s8, v0
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT:    v_mul_lo_u32 v3, s2, v2
; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s9, v1
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v4, v5
; GFX8-NEXT:    s_sub_i32 s2, 0, s11
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT:    v_mul_f32_e32 v3, s12, v4
; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s9, v1
; GFX8-NEXT:    v_mul_hi_u32 v2, s6, v2
; GFX8-NEXT:    v_mul_lo_u32 v5, s2, v3
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX8-NEXT:    v_mul_lo_u32 v2, v2, s10
; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v5
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s6, v2
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT:    v_mul_hi_u32 v3, s7, v3
; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s10, v2
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
; GFX8-NEXT:    v_mul_lo_u32 v3, v3, s11
; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s10, v2
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s7, v3
; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s11, v3
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s11, v3
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    s_endpgm
  %result0 = udiv <4 x i32> %x, %y
  store <4 x i32> %result0, <4 x i32> addrspace(1)* %out
  %result1 = urem <4 x i32> %x, %y
  store <4 x i32> %result1, <4 x i32> addrspace(1)* %out
  ret void
}