; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s

; Codegen checks for a udiv and a urem of the same operands, for scalar i32
; and for <2 x i32> / <4 x i32> vectors. Each llc invocation above is matched
; against its own FileCheck prefix (R600, GFX6 and GFX8 respectively).

; Scalar case: the quotient is stored through %out0 and the remainder through
; %out1, so the expected code contains two stores per target.
; The [8 x i32] parameters are unnamed and never referenced in the body;
; presumably they only space out the kernel-argument offsets (verify).
define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1, [8 x i32], i32 %x, [8 x i32], i32 %y) {
; R600-LABEL: test_udivrem:
; R600: ; %bb.0:
; R600-NEXT: ALU 21, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: SUB_INT T0.W, 0.0, KC0[9].X,
; R600-NEXT: RECIP_UINT * T0.X, KC0[9].X,
; R600-NEXT: MULLO_INT * T0.Y, PV.W, PS,
; R600-NEXT: MULHI * T0.Y, T0.X, PS,
; R600-NEXT: ADD_INT * T0.W, T0.X, PS,
; R600-NEXT: MULHI * T0.X, KC0[6].W, PV.W,
; R600-NEXT: MULLO_INT * T0.Y, PS, KC0[9].X,
; R600-NEXT: SUB_INT * T0.W, KC0[6].W, PS,
; R600-NEXT: SUB_INT T1.W, PV.W, KC0[9].X,
; R600-NEXT: SETGE_UINT * T2.W, PV.W, KC0[9].X,
; R600-NEXT: CNDE_INT * T0.W, PS, T0.W, PV.W,
; R600-NEXT: ADD_INT T0.Z, T0.X, 1,
; R600-NEXT: SUB_INT T1.W, PV.W, KC0[9].X,
; R600-NEXT: SETGE_UINT * T3.W, PV.W, KC0[9].X,
; R600-NEXT: CNDE_INT T1.X, PS, T0.W, PV.W,
; R600-NEXT: CNDE_INT T0.W, T2.W, T0.X, PV.Z,
; R600-NEXT: LSHR * T0.X, KC0[4].Z, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT: ADD_INT * T1.W, PV.W, 1,
; R600-NEXT: CNDE_INT T2.X, T3.W, T0.W, PV.W,
; R600-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX6-LABEL: test_udivrem:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s2, s[0:1], 0x26
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s10, s6
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX6-NEXT: s_sub_i32 s3, 0, s2
; GFX6-NEXT: s_mov_b32 s11, s7
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0
; GFX6-NEXT: s_load_dword s3, s[0:1], 0x1d
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
; GFX6-NEXT: v_mul_lo_u32 v1, v0, s2
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s3, v1
; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v1
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v1
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, v2, s[0:1]
; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: test_udivrem:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s6, s[0:1], 0x98
; GFX8-NEXT: s_load_dword s7, s[0:1], 0x74
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX8-NEXT: s_sub_i32 s2, 0, s6
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: v_mul_hi_u32 v2, s7, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mul_lo_u32 v3, v2, s6
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s7, v3
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v3
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s6, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v3
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s6, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
  ; Quotient -> %out0, remainder -> %out1 (distinct destinations).
  %result0 = udiv i32 %x, %y
  store i32 %result0, i32 addrspace(1)* %out0
  %result1 = urem i32 %x, %y
  store i32 %result1, i32 addrspace(1)* %out1
  ret void
}

; <2 x i32> case with a single output pointer.
; NOTE(review): both stores below target %out, so the urem store overwrites
; the udiv store. The expected code for every target contains exactly one
; store, i.e. only the remainder is observable here. Confirm whether a second
; output pointer was intended; changing it requires regenerating the checks.
define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
; R600-LABEL: test_udivrem_v2:
; R600: ; %bb.0:
; R600-NEXT: ALU 29, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: SUB_INT T0.W, 0.0, KC0[3].Z,
; R600-NEXT: RECIP_UINT * T0.X, KC0[3].Z,
; R600-NEXT: MULLO_INT * T0.Y, PV.W, PS,
; R600-NEXT: SUB_INT T0.W, 0.0, KC0[3].Y,
; R600-NEXT: RECIP_UINT * T0.Z, KC0[3].Y,
; R600-NEXT: MULLO_INT * T0.W, PV.W, PS,
; R600-NEXT: MULHI * T0.W, T0.Z, PS,
; R600-NEXT: ADD_INT T0.W, T0.Z, PS,
; R600-NEXT: MULHI * T0.Y, T0.X, T0.Y,
; R600-NEXT: ADD_INT T1.W, T0.X, PS,
; R600-NEXT: MULHI * T0.X, KC0[2].W, PV.W,
; R600-NEXT: MULHI * T0.Y, KC0[3].X, PV.W,
; R600-NEXT: MULLO_INT * T0.Y, PS, KC0[3].Z,
; R600-NEXT: SUB_INT T0.W, KC0[3].X, PS,
; R600-NEXT: MULLO_INT * T0.X, T0.X, KC0[3].Y,
; R600-NEXT: SUB_INT T0.Z, KC0[2].W, PS,
; R600-NEXT: SETGE_UINT T1.W, PV.W, KC0[3].Z,
; R600-NEXT: SUB_INT * T2.W, PV.W, KC0[3].Z,
; R600-NEXT: CNDE_INT T1.Z, PV.W, T0.W, PS,
; R600-NEXT: SETGE_UINT T0.W, PV.Z, KC0[3].Y,
; R600-NEXT: SUB_INT * T1.W, PV.Z, KC0[3].Y,
; R600-NEXT: CNDE_INT T0.Z, PV.W, T0.Z, PS,
; R600-NEXT: SETGE_UINT T0.W, PV.Z, KC0[3].Z,
; R600-NEXT: SUB_INT * T1.W, PV.Z, KC0[3].Z,
; R600-NEXT: CNDE_INT T0.Y, PV.W, T1.Z, PS,
; R600-NEXT: SETGE_UINT T0.W, PV.Z, KC0[3].Y,
; R600-NEXT: SUB_INT * T1.W, PV.Z, KC0[3].Y,
; R600-NEXT: CNDE_INT T0.X, PV.W, T0.Z, PS,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX6-LABEL: test_udivrem_v2:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX6-NEXT: s_sub_i32 s2, 0, s6
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0
; GFX6-NEXT: s_sub_i32 s2, 0, s7
; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6
; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0
; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s7, v1
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: test_udivrem_v2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX8-NEXT: s_sub_i32 s2, 0, s6
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s7
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: s_mul_i32 s2, s2, s6
; GFX8-NEXT: s_sub_i32 s2, s4, s2
; GFX8-NEXT: s_sub_i32 s3, s2, s6
; GFX8-NEXT: s_cmp_ge_u32 s2, s6
; GFX8-NEXT: s_cselect_b32 s2, s3, s2
; GFX8-NEXT: s_sub_i32 s3, s2, s6
; GFX8-NEXT: s_cmp_ge_u32 s2, s6
; GFX8-NEXT: s_cselect_b32 s2, s3, s2
; GFX8-NEXT: s_sub_i32 s3, 0, s7
; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v1, s5, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: s_mul_i32 s2, s2, s7
; GFX8-NEXT: s_sub_i32 s2, s5, s2
; GFX8-NEXT: s_sub_i32 s3, s2, s7
; GFX8-NEXT: s_cmp_ge_u32 s2, s7
; GFX8-NEXT: s_cselect_b32 s2, s3, s2
; GFX8-NEXT: s_sub_i32 s3, s2, s7
; GFX8-NEXT: s_cmp_ge_u32 s2, s7
; GFX8-NEXT: s_cselect_b32 s2, s3, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
  ; Both results are written to the same address; the second store wins.
  %result0 = udiv <2 x i32> %x, %y
  store <2 x i32> %result0, <2 x i32> addrspace(1)* %out
  %result1 = urem <2 x i32> %x, %y
  store <2 x i32> %result1, <2 x i32> addrspace(1)* %out
  ret void
}

; <4 x i32> case with a single output pointer.
; NOTE(review): as in test_udivrem_v2, the udiv and urem results are stored to
; the same %out, and the expected code for every target contains exactly one
; store. Confirm whether the quotient was meant to be observable as well.
define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
; R600-LABEL: test_udivrem_v4:
; R600: ; %bb.0:
; R600-NEXT: ALU 57, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: SUB_INT T0.W, 0.0, KC0[5].X,
; R600-NEXT: RECIP_UINT * T0.X, KC0[5].X,
; R600-NEXT: MULLO_INT * T0.Y, PV.W, PS,
; R600-NEXT: SUB_INT T0.W, 0.0, KC0[4].Z,
; R600-NEXT: RECIP_UINT * T0.Z, KC0[4].Z,
; R600-NEXT: MULLO_INT * T0.W, PV.W, PS,
; R600-NEXT: MULHI * T0.W, T0.Z, PS,
; R600-NEXT: ADD_INT T0.W, T0.Z, PS,
; R600-NEXT: MULHI * T0.Y, T0.X, T0.Y,
; R600-NEXT: ADD_INT T1.W, T0.X, PS,
; R600-NEXT: MULHI * T0.X, KC0[3].Z, PV.W,
; R600-NEXT: MULHI * T0.Y, KC0[4].X, PV.W,
; R600-NEXT: MULLO_INT * T0.Y, PS, KC0[5].X,
; R600-NEXT: RECIP_UINT * T0.Z, KC0[4].Y,
; R600-NEXT: SUB_INT T0.W, 0.0, KC0[4].W,
; R600-NEXT: RECIP_UINT * T1.X, KC0[4].W,
; R600-NEXT: MULLO_INT * T0.W, PV.W, PS,
; R600-NEXT: SUB_INT T1.W, 0.0, KC0[4].Y,
; R600-NEXT: MULHI * T0.W, T1.X, PS,
; R600-NEXT: ADD_INT T0.W, T1.X, PS,
; R600-NEXT: MULLO_INT * T1.X, PV.W, T0.Z,
; R600-NEXT: MULHI * T0.W, KC0[3].W, PV.W,
; R600-NEXT: MULLO_INT * T0.W, PS, KC0[4].W,
; R600-NEXT: SUB_INT T0.W, KC0[3].W, PS,
; R600-NEXT: MULHI * T1.X, T0.Z, T1.X,
; R600-NEXT: SETGE_UINT T1.Y, PV.W, KC0[4].W,
; R600-NEXT: ADD_INT T0.Z, T0.Z, PS,
; R600-NEXT: SUB_INT T1.W, KC0[4].X, T0.Y,
; R600-NEXT: MULLO_INT * T0.X, T0.X, KC0[4].Z,
; R600-NEXT: SUB_INT T0.Y, KC0[3].Z, PS,
; R600-NEXT: SETGE_UINT T1.Z, PV.W, KC0[5].X,
; R600-NEXT: SUB_INT * T2.W, PV.W, KC0[5].X,
; R600-NEXT: MULHI * T0.X, KC0[3].Y, T0.Z,
; R600-NEXT: SUB_INT T1.X, T0.W, KC0[4].W,
; R600-NEXT: CNDE_INT T2.Y, T1.Z, T1.W, T2.W,
; R600-NEXT: SETGE_UINT T0.Z, T0.Y, KC0[4].Z,
; R600-NEXT: SUB_INT T1.W, T0.Y, KC0[4].Z,
; R600-NEXT: MULLO_INT * T0.X, PS, KC0[4].Y,
; R600-NEXT: CNDE_INT T2.X, PV.Z, T0.Y, PV.W,
; R600-NEXT: SETGE_UINT T0.Y, PV.Y, KC0[5].X,
; R600-NEXT: SUB_INT T0.Z, PV.Y, KC0[5].X,
; R600-NEXT: SUB_INT T1.W, KC0[3].Y, PS,
; R600-NEXT: CNDE_INT * T0.W, T1.Y, T0.W, PV.X,
; R600-NEXT: SETGE_UINT T0.X, PS, KC0[4].W,
; R600-NEXT: SUB_INT T1.Y, PS, KC0[4].W,
; R600-NEXT: SETGE_UINT T1.Z, PV.W, KC0[4].Y,
; R600-NEXT: SUB_INT T2.W, PV.W, KC0[4].Y,
; R600-NEXT: CNDE_INT * T3.W, PV.Y, T2.Y, PV.Z,
; R600-NEXT: CNDE_INT T0.Y, PV.Z, T1.W, PV.W,
; R600-NEXT: CNDE_INT T3.Z, PV.X, T0.W, PV.Y, BS:VEC_021/SCL_122
; R600-NEXT: SETGE_UINT T0.W, T2.X, KC0[4].Z,
; R600-NEXT: SUB_INT * T1.W, T2.X, KC0[4].Z,
; R600-NEXT: CNDE_INT T3.Y, PV.W, T2.X, PS,
; R600-NEXT: SETGE_UINT T0.W, PV.Y, KC0[4].Y,
; R600-NEXT: SUB_INT * T1.W, PV.Y, KC0[4].Y,
; R600-NEXT: CNDE_INT T3.X, PV.W, T0.Y, PS,
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX6-LABEL: test_udivrem_v4:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9
; GFX6-NEXT: s_sub_i32 s12, 0, s8
; GFX6-NEXT: s_sub_i32 s13, 0, s9
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10
; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s11
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX6-NEXT: v_mul_lo_u32 v2, s12, v0
; GFX6-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8
; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v3
; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; GFX6-NEXT: s_sub_i32 s4, 0, s10
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX6-NEXT: v_mul_lo_u32 v3, s4, v2
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1
; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5
; GFX6-NEXT: s_sub_i32 s4, 0, s11
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4
; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2
; GFX6-NEXT: v_mul_lo_u32 v5, s4, v3
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10
; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2
; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3
; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2
; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: test_udivrem_v4:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX8-NEXT: s_sub_i32 s2, 0, s8
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s9
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s10
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: s_mul_i32 s2, s2, s8
; GFX8-NEXT: s_sub_i32 s2, s4, s2
; GFX8-NEXT: s_sub_i32 s3, s2, s8
; GFX8-NEXT: s_cmp_ge_u32 s2, s8
; GFX8-NEXT: s_cselect_b32 s2, s3, s2
; GFX8-NEXT: s_sub_i32 s3, s2, s8
; GFX8-NEXT: s_cmp_ge_u32 s2, s8
; GFX8-NEXT: s_cselect_b32 s2, s3, s2
; GFX8-NEXT: s_sub_i32 s3, 0, s9
; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11
; GFX8-NEXT: v_readfirstlane_b32 s3, v0
; GFX8-NEXT: s_mul_i32 s3, s3, s9
; GFX8-NEXT: s_sub_i32 s3, s5, s3
; GFX8-NEXT: s_sub_i32 s4, s3, s9
; GFX8-NEXT: s_cmp_ge_u32 s3, s9
; GFX8-NEXT: s_cselect_b32 s3, s4, s3
; GFX8-NEXT: s_sub_i32 s4, s3, s9
; GFX8-NEXT: s_cmp_ge_u32 s3, s9
; GFX8-NEXT: s_cselect_b32 s3, s4, s3
; GFX8-NEXT: s_sub_i32 s4, 0, s10
; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mul_i32 s4, s4, s10
; GFX8-NEXT: s_sub_i32 s4, s6, s4
; GFX8-NEXT: s_sub_i32 s5, s4, s10
; GFX8-NEXT: s_cmp_ge_u32 s4, s10
; GFX8-NEXT: s_cselect_b32 s4, s5, s4
; GFX8-NEXT: s_sub_i32 s5, s4, s10
; GFX8-NEXT: s_cmp_ge_u32 s4, s10
; GFX8-NEXT: s_cselect_b32 s4, s5, s4
; GFX8-NEXT: s_sub_i32 s5, 0, s11
; GFX8-NEXT: v_mul_lo_u32 v0, s5, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v3, s7, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_readfirstlane_b32 s2, v3
; GFX8-NEXT: s_mul_i32 s2, s2, s11
; GFX8-NEXT: s_sub_i32 s2, s7, s2
; GFX8-NEXT: s_sub_i32 s3, s2, s11
; GFX8-NEXT: s_cmp_ge_u32 s2, s11
; GFX8-NEXT: s_cselect_b32 s2, s3, s2
; GFX8-NEXT: s_sub_i32 s3, s2, s11
; GFX8-NEXT: s_cmp_ge_u32 s2, s11
; GFX8-NEXT: s_cselect_b32 s2, s3, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
  ; Both results are written to the same address; the second store wins.
  %result0 = udiv <4 x i32> %x, %y
  store <4 x i32> %result0, <4 x i32> addrspace(1)* %out
  %result1 = urem <4 x i32> %x, %y
  store <4 x i32> %result1, <4 x i32> addrspace(1)* %out
  ret void
}