; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn | FileCheck %s -check-prefixes=FUNC,SI,GCN
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s -check-prefixes=FUNC,SI,TONGA
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global | FileCheck %s -check-prefixes=FUNC,SI,GFX9
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood | FileCheck %s -check-prefixes=FUNC,EG

; The code generated by sdiv is long and complex and may frequently change.
; The goal of this test is to make sure the ISel doesn't fail.
;
; This program was previously failing to compile when one of the selectcc
; opcodes generated by the sdiv lowering was being legalized and optimized to:
; selectcc Remainder -1, 0, -1, SETGT
; This was fixed by adding an additional pattern in R600Instructions.td to
; match this pattern with a CNDGE_INT.

; Scalar i32 sdiv with both operands loaded from memory. Element 0 of %in is
; the dividend and element 1 (addressed through %den_ptr) is the divisor; the
; quotient is stored to %out. The autogenerated assertions below pin the full
; division expansion for each tested target and must stay in this exact order.
define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; GCN-LABEL: sdiv_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s10, s6
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1
; GCN-NEXT: v_xor_b32_e32 v4, v2, v3
; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
; GCN-NEXT: v_xor_b32_e32 v1, v1, v3
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1
; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
; GCN-NEXT: v_mul_hi_u32 v3, v2, v1
; GCN-NEXT: v_mul_lo_u32 v5, v2, v1
; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v5
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3
; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
; GCN-NEXT: v_mul_hi_u32 v3, v3, v2
; GCN-NEXT: v_add_i32_e32 v5, vcc, v3, v2
; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v3, v2
; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GCN-NEXT: v_mul_hi_u32 v2, v2, v0
; GCN-NEXT: v_mul_lo_u32 v3, v2, v1
; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v2
; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v2
; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v0
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v1
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v5, s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; GCN-NEXT: v_xor_b32_e32 v0, v0, v4
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: sdiv_i32:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s7, 0xf000
; TONGA-NEXT: s_mov_b32 s6, -1
; TONGA-NEXT: s_mov_b32 s2, s6
; TONGA-NEXT: s_mov_b32 s3, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s0, s10
; TONGA-NEXT: s_mov_b32 s1, s11
; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; TONGA-NEXT: s_mov_b32 s4, s8
; TONGA-NEXT: s_mov_b32 s5, s9
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v2
; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v1
; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v0
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v6, v0
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v6
; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
; TONGA-NEXT: v_xor_b32_e32 v2, v6, v2
; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3
; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
; TONGA-NEXT: v_mul_lo_u32 v4, v3, v1
; TONGA-NEXT: v_mul_hi_u32 v5, v3, v1
; TONGA-NEXT: v_sub_u32_e32 v7, vcc, 0, v4
; TONGA-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5
; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
; TONGA-NEXT: v_mul_hi_u32 v4, v4, v3
; TONGA-NEXT: v_add_u32_e32 v5, vcc, v4, v3
; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v4, v3
; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
; TONGA-NEXT: v_mul_hi_u32 v3, v3, v0
; TONGA-NEXT: v_mul_lo_u32 v4, v3, v1
; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
; TONGA-NEXT: v_add_u32_e32 v6, vcc, -1, v3
; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, v4, v0
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v1
; TONGA-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; TONGA-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1]
; TONGA-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2
; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s10, s6
; GFX9-NEXT: s_mov_b32 s11, s7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s2
; GFX9-NEXT: s_mov_b32 s9, s3
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX9-NEXT: v_mul_lo_u32 v4, v3, v1
; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
; GFX9-NEXT: v_sub_u32_e32 v6, 0, v4
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX9-NEXT: v_mul_hi_u32 v4, v4, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v5
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v5
; GFX9-NEXT: v_add_u32_e32 v6, v3, v4
; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX9-NEXT: v_mul_hi_u32 v3, v3, v0
; GFX9-NEXT: v_xor_b32_e32 v2, v5, v2
; GFX9-NEXT: v_mul_lo_u32 v4, v3, v1
; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
; GFX9-NEXT: v_add_u32_e32 v6, -1, v3
; GFX9-NEXT: v_sub_u32_e32 v7, v0, v4
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v1
; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sdiv_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 30, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: SETGT_INT * T0.W, 0.0, T0.Y,
; EG-NEXT: ADD_INT * T1.W, T0.Y, PV.W,
; EG-NEXT: XOR_INT * T1.W, PV.W, T0.W,
; EG-NEXT: RECIP_UINT * T0.Y, PV.W,
; EG-NEXT: MULLO_INT * T0.Z, PS, T1.W,
; EG-NEXT: SUB_INT T2.W, 0.0, PS,
; EG-NEXT: MULHI * T1.X, T0.Y, T1.W,
; EG-NEXT: CNDE_INT T2.W, PS, PV.W, T0.Z,
; EG-NEXT: SETGT_INT * T3.W, 0.0, T0.X,
; EG-NEXT: MULHI * T0.Z, PV.W, T0.Y,
; EG-NEXT: ADD_INT T1.Z, T0.X, T3.W,
; EG-NEXT: ADD_INT T2.W, T0.Y, PS,
; EG-NEXT: SUB_INT * T4.W, T0.Y, PS,
; EG-NEXT: CNDE_INT T2.W, T1.X, PV.W, PS,
; EG-NEXT: XOR_INT * T4.W, PV.Z, T3.W,
; EG-NEXT: MULHI * T0.X, PV.W, PS,
; EG-NEXT: MULLO_INT * T0.Y, PS, T1.W,
; EG-NEXT: SUB_INT * T2.W, T4.W, PS,
; EG-NEXT: SETGE_UINT T1.W, PV.W, T1.W,
; EG-NEXT: SETGE_UINT * T2.W, T4.W, T0.Y,
; EG-NEXT: AND_INT T1.W, PV.W, PS,
; EG-NEXT: ADD_INT * T4.W, T0.X, 1,
; EG-NEXT: CNDE_INT T1.W, PV.W, T0.X, PS,
; EG-NEXT: ADD_INT * T4.W, T0.X, literal.x,
; EG-NEXT: -1(nan), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T1.W, T2.W, PS, PV.W,
; EG-NEXT: XOR_INT * T0.W, T3.W, T0.W,
; EG-NEXT: XOR_INT * T1.W, PV.W, PS,
; EG-NEXT: SUB_INT T0.X, PV.W, T0.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  ; The divisor lives one i32 element past the dividend in the input buffer.
  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %num = load i32, i32 addrspace(1) * %in
  %den = load i32, i32 addrspace(1) * %den_ptr
  %result = sdiv i32 %num, %den
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; i32 sdiv of a loaded value by the constant 4, stored to %out. Every target
; is expected to emit a short shift/add sequence (arithmetic shift by 31,
; logical shift by 30, add, arithmetic shift by 2) instead of the generic
; expansion checked in sdiv_i32.
define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; GCN-LABEL: sdiv_i32_4:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s10, s6
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 30, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: sdiv_i32_4:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_mov_b32 s4, s6
; TONGA-NEXT: s_mov_b32 s5, s7
; TONGA-NEXT: s_mov_b32 s6, s2
; TONGA-NEXT: s_mov_b32 s7, s3
; TONGA-NEXT: buffer_load_dword v0, off, s[4:7], 0
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0
; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i32_4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s2
; GFX9-NEXT: s_mov_b32 s7, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 30, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sdiv_i32_4:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: ASHR * T0.W, T0.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, T0.X, PV.W,
; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %num = load i32, i32 addrspace(1) * %in
  %result = sdiv i32 %num, 4
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; Divide by a weird constant to make sure setIntDivIsCheap is working: the
; division in the next test is expected to become a multiply-high by a
; constant followed by shifts and adds.

; i32 sdiv of a loaded value by the constant 3435, stored to %out. Every
; target is expected to use a signed multiply-high by a constant followed by
; an add, shifts by 11 and 31, and a final add, rather than the generic
; expansion checked in sdiv_i32.
define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; GCN-LABEL: slow_sdiv_i32_3435:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s10, s6
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s2, 0x98a1930b
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_hi_i32 v1, v0, s2
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: slow_sdiv_i32_3435:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
; TONGA-NEXT: s_mov_b32 s10, s2
; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s8, s6
; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0
; TONGA-NEXT: s_mov_b32 s0, 0x98a1930b
; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_mul_hi_i32 v1, v0, s0
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; TONGA-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 11, v0
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: slow_sdiv_i32_3435:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s0, 0x98a1930b
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: v_add_u32_e32 v0, v1, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 11, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: slow_sdiv_i32_3435:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x,
; EG-NEXT: -1734241525(-4.176600e-24), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, PS, T0.X,
; EG-NEXT: ASHR T1.W, PV.W, literal.x,
; EG-NEXT: LSHR * T0.W, PV.W, literal.y,
; EG-NEXT: 11(1.541428e-44), 31(4.344025e-44)
; EG-NEXT: ADD_INT T0.X, PV.W, PS,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %num = load i32, i32 addrspace(1) * %in
  %result = sdiv i32 %num, 3435
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; <2 x i32> sdiv with both operands loaded from memory. Element 0 of %in is
; the dividend vector and element 1 (addressed through %den_ptr) is the
; divisor vector; the quotient vector is stored to %out.
define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
; GCN-LABEL: sdiv_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, -1
; GCN-NEXT: s_mov_b32 s6, s10
; GCN-NEXT: s_mov_b32 s7, s11
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s2
; GCN-NEXT: s_mov_b32 s5, s3
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_mov_b32 s2, 0x4f800000
; GCN-NEXT: s_mov_b32 s8, s0
; GCN-NEXT: s_mov_b32 s9, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1
; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3
; GCN-NEXT: v_xor_b32_e32 v8, v4, v5
; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; GCN-NEXT: v_xor_b32_e32 v9, v6, v7
; GCN-NEXT: v_add_i32_e32 v1, vcc, v6, v1
; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; GCN-NEXT: v_xor_b32_e32 v0, v0, v4
; GCN-NEXT: v_xor_b32_e32 v2, v2, v5
; GCN-NEXT: v_xor_b32_e32 v1, v1, v6
; GCN-NEXT: v_xor_b32_e32 v3, v3, v7
; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3
; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5
; GCN-NEXT: v_mul_f32_e32 v4, s2, v4
; GCN-NEXT: v_mul_f32_e32 v5, s2, v5
; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
; GCN-NEXT: v_mul_hi_u32 v6, v4, v2
; GCN-NEXT: v_mul_lo_u32 v7, v4, v2
; GCN-NEXT: v_mul_hi_u32 v10, v5, v3
; GCN-NEXT: v_mul_lo_u32 v11, v5, v3
; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v7
; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
; GCN-NEXT: v_cndmask_b32_e64 v6, v7, v12, s[0:1]
; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v10
; GCN-NEXT: v_cndmask_b32_e64 v7, v11, v13, s[2:3]
; GCN-NEXT: v_mul_hi_u32 v6, v6, v4
; GCN-NEXT: v_mul_hi_u32 v7, v7, v5
; GCN-NEXT: v_add_i32_e32 v10, vcc, v6, v4
; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v6, v4
; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v5
; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v7, v5
; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[2:3]
; GCN-NEXT: v_mul_hi_u32 v4, v4, v0
; GCN-NEXT: v_mul_hi_u32 v5, v5, v1
; GCN-NEXT: v_mul_lo_u32 v6, v4, v2
; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4
; GCN-NEXT: v_add_i32_e32 v10, vcc, -1, v4
; GCN-NEXT: v_mul_lo_u32 v11, v5, v3
; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v5
; GCN-NEXT: v_add_i32_e32 v13, vcc, -1, v5
; GCN-NEXT: v_subrev_i32_e32 v14, vcc, v6, v0
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v6
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v11, v1
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v11
; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v2
; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3
; GCN-NEXT: s_and_b64 s[2:3], s[2:3], s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v0, v4, v7, s[2:3]
; GCN-NEXT: s_and_b64 s[2:3], s[4:5], vcc
; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v12, s[2:3]
; GCN-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
; GCN-NEXT: v_xor_b32_e32 v0, v0, v8
; GCN-NEXT: v_xor_b32_e32 v1, v1, v9
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: sdiv_v2i32:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s11, 0xf000
; TONGA-NEXT: s_mov_b32 s10, -1
; TONGA-NEXT: s_mov_b32 s4, 0x4f800000
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s8, s0
; TONGA-NEXT: s_mov_b32 s9, s1
; TONGA-NEXT: s_mov_b32 s0, s2
; TONGA-NEXT: s_mov_b32 s1, s3
; TONGA-NEXT: s_mov_b32 s2, s10
; TONGA-NEXT: s_mov_b32 s3, s11
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v5, v2
; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3
; TONGA-NEXT: v_xor_b32_e32 v2, v2, v5
; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
; TONGA-NEXT: v_xor_b32_e32 v8, v4, v5
; TONGA-NEXT: v_cvt_f32_u32_e32 v5, v2
; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7
; TONGA-NEXT: v_xor_b32_e32 v9, v6, v7
; TONGA-NEXT: v_cvt_f32_u32_e32 v7, v3
; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v5
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4
; TONGA-NEXT: v_rcp_iflag_f32_e32 v7, v7
; TONGA-NEXT: v_mul_f32_e32 v4, s4, v5
; TONGA-NEXT: v_cvt_u32_f32_e32 v4, v4
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v6, v1
; TONGA-NEXT: v_mul_f32_e32 v5, s4, v7
; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v6
; TONGA-NEXT: v_mul_hi_u32 v6, v4, v2
; TONGA-NEXT: v_mul_lo_u32 v7, v4, v2
; TONGA-NEXT: v_mul_hi_u32 v10, v5, v3
; TONGA-NEXT: v_mul_lo_u32 v11, v5, v3
; TONGA-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v7
; TONGA-NEXT: v_cndmask_b32_e64 v6, v7, v12, s[0:1]
; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v11
; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v10
; TONGA-NEXT: v_cndmask_b32_e64 v7, v11, v13, s[2:3]
; TONGA-NEXT: v_mul_hi_u32 v6, v6, v4
; TONGA-NEXT: v_mul_hi_u32 v7, v7, v5
; TONGA-NEXT: v_add_u32_e32 v10, vcc, v6, v4
; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v6, v4
; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
; TONGA-NEXT: v_add_u32_e32 v6, vcc, v7, v5
; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v7, v5
; TONGA-NEXT: v_mul_hi_u32 v4, v4, v0
; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[2:3]
; TONGA-NEXT: v_mul_hi_u32 v5, v5, v1
; TONGA-NEXT: v_mul_lo_u32 v6, v4, v2
; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4
; TONGA-NEXT: v_mul_lo_u32 v11, v5, v3
; TONGA-NEXT: v_add_u32_e32 v10, vcc, -1, v4
; TONGA-NEXT: v_subrev_u32_e32 v14, vcc, v6, v0
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v6
; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v2
; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v11, v1
; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v5
; TONGA-NEXT: v_add_u32_e32 v13, vcc, -1, v5
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v11
; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3
; TONGA-NEXT: s_and_b64 s[2:3], s[2:3], s[0:1]
; TONGA-NEXT: v_cndmask_b32_e64 v0, v4, v7, s[2:3]
; TONGA-NEXT: s_and_b64 s[2:3], s[4:5], vcc
; TONGA-NEXT: v_cndmask_b32_e64 v1, v5, v12, s[2:3]
; TONGA-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[0:1]
; TONGA-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v9
; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v9
; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s4, 0x4f800000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s0
; GFX9-NEXT: s_mov_b32 s9, s1
; GFX9-NEXT: s_mov_b32 s0, s2
; GFX9-NEXT: s_mov_b32 s1, s3
; GFX9-NEXT: s_mov_b32 s2, s10
; GFX9-NEXT: s_mov_b32 s3, s11
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v3
; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
; GFX9-NEXT: v_add_u32_e32 v3, v3, v6
; GFX9-NEXT: v_xor_b32_e32 v2, v2, v5
; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2
; GFX9-NEXT: v_xor_b32_e32 v3, v3, v6
; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7
; GFX9-NEXT: v_add_u32_e32 v0, v0, v4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8
; GFX9-NEXT: v_xor_b32_e32 v5, v4, v5
; GFX9-NEXT: v_mul_f32_e32 v7, s4, v7
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX9-NEXT: v_mul_f32_e32 v8, s4, v8
; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4
; GFX9-NEXT: v_mul_lo_u32 v4, v7, v2
; GFX9-NEXT: v_mul_hi_u32 v11, v7, v2
; GFX9-NEXT: v_mul_lo_u32 v10, v8, v3
; GFX9-NEXT: v_mul_hi_u32 v12, v8, v3
; GFX9-NEXT: v_sub_u32_e32 v13, 0, v4
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc
; GFX9-NEXT: v_sub_u32_e32 v14, 0, v10
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v12
; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v14, s[0:1]
; GFX9-NEXT: v_mul_hi_u32 v4, v4, v7
; GFX9-NEXT: v_mul_hi_u32 v10, v10, v8
; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; GFX9-NEXT: v_add_u32_e32 v1, v1, v9
; GFX9-NEXT: v_xor_b32_e32 v6, v9, v6
; GFX9-NEXT: v_xor_b32_e32 v1, v1, v9
; GFX9-NEXT: v_add_u32_e32 v9, v7, v4
; GFX9-NEXT: v_sub_u32_e32 v4, v7, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
; GFX9-NEXT: v_add_u32_e32 v7, v8, v10
; GFX9-NEXT: v_sub_u32_e32 v8, v8, v10
; GFX9-NEXT: v_mul_hi_u32 v4, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
; GFX9-NEXT: v_mul_hi_u32 v7, v7, v1
; GFX9-NEXT: v_mul_lo_u32 v8, v4, v2
; GFX9-NEXT: v_add_u32_e32 v9, 1, v4
; GFX9-NEXT: v_mul_lo_u32 v11, v7, v3
; GFX9-NEXT: v_add_u32_e32 v12, 1, v7
; GFX9-NEXT: v_sub_u32_e32 v14, v0, v8
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8
; GFX9-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v2
; GFX9-NEXT: v_sub_u32_e32 v0, v1, v11
; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v11
; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3
; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v9, s[2:3]
; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1]
; GFX9-NEXT: v_add_u32_e32 v10, -1, v4
; GFX9-NEXT: v_add_u32_e32 v13, -1, v7
; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v12, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[0:1]
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v5
; GFX9-NEXT: v_xor_b32_e32 v1, v1, v6
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v5
; GFX9-NEXT: v_sub_u32_e32 v1, v1, v6
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sdiv_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 59, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T1.XY, T0.X, 8, #1
; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: SETGT_INT * T0.W, 0.0, T1.Y,
; EG-NEXT: ADD_INT * T1.W, T1.Y, PV.W,
; EG-NEXT: XOR_INT T1.W, PV.W, T0.W,
; EG-NEXT: SETGT_INT * T2.W, 0.0, T1.X,
; EG-NEXT: ADD_INT T3.W, T1.X, PS,
; EG-NEXT: RECIP_UINT * T0.Z, PV.W,
; EG-NEXT: XOR_INT T3.W, PV.W, T2.W, BS:VEC_021/SCL_122
; EG-NEXT: MULLO_INT * T1.X, PS, T1.W,
; EG-NEXT: RECIP_UINT * T1.Y, PV.W,
; EG-NEXT: MULLO_INT * T1.Z, PS, T3.W,
; EG-NEXT: SUB_INT T4.W, 0.0, PS,
; EG-NEXT: MULHI * T2.X, T1.Y, T3.W,
; EG-NEXT: CNDE_INT T1.Z, PS, PV.W, T1.Z, BS:VEC_021/SCL_122
; EG-NEXT: SUB_INT T4.W, 0.0, T1.X,
; EG-NEXT: MULHI * T2.Y, T0.Z, T1.W,
; EG-NEXT: CNDE_INT T2.Z, PS, PV.W, T1.X,
; EG-NEXT: SETGT_INT T4.W, 0.0, T0.X,
; EG-NEXT: MULHI * T1.X, PV.Z, T1.Y,
; EG-NEXT: SETGT_INT T3.X, 0.0, T0.Y,
; EG-NEXT: ADD_INT T3.Y, T0.X, PV.W,
; EG-NEXT: ADD_INT T1.Z, T1.Y, PS,
; EG-NEXT: SUB_INT T5.W, T1.Y, PS,
; EG-NEXT: MULHI * T0.X, PV.Z, T0.Z,
; EG-NEXT: CNDE_INT T1.X, T2.X, PV.Z, PV.W,
; EG-NEXT: XOR_INT T1.Y, PV.Y, T4.W,
; EG-NEXT: ADD_INT T1.Z, T0.Y, PV.X,
; EG-NEXT: ADD_INT T5.W, T0.Z, PS,
; EG-NEXT: SUB_INT * T6.W, T0.Z, PS,
; EG-NEXT: CNDE_INT T0.Z, T2.Y, PV.W, PS,
; EG-NEXT: XOR_INT T5.W, PV.Z, T3.X,
; EG-NEXT: MULHI * T0.X, PV.X, PV.Y,
; EG-NEXT: MULHI * T0.Y, PV.Z, PV.W,
; EG-NEXT: MULLO_INT * T0.Z, PS, T1.W,
; EG-NEXT: SUB_INT T6.W, T5.W, PS,
; EG-NEXT: MULLO_INT * T1.X, T0.X, T3.W,
; EG-NEXT: SUB_INT T1.Z, T1.Y, PS,
; EG-NEXT: SETGE_UINT T1.W, PV.W, T1.W,
; EG-NEXT: SETGE_UINT * T5.W, T5.W, T0.Z,
; EG-NEXT: AND_INT T2.Y, PV.W, PS,
; EG-NEXT: ADD_INT T0.Z, T0.Y, 1,
; EG-NEXT: SETGE_UINT T1.W, PV.Z, T3.W,
; EG-NEXT: SETGE_UINT * T3.W, T1.Y, T1.X,
; EG-NEXT: AND_INT T1.Y, PV.W, PS,
; EG-NEXT: ADD_INT T1.Z, T0.X, 1,
; EG-NEXT: CNDE_INT T1.W, PV.Y, T0.Y, PV.Z,
; EG-NEXT: ADD_INT * T6.W, T0.Y, literal.x,
; EG-NEXT: -1(nan), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.Y, T5.W, PS, PV.W,
; EG-NEXT: XOR_INT T0.Z, T3.X, T0.W,
; EG-NEXT: CNDE_INT T0.W, PV.Y, T0.X, PV.Z,
; EG-NEXT: ADD_INT * T1.W, T0.X, literal.x,
; EG-NEXT: -1(nan), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T1.Z, T3.W, PS, PV.W,
; EG-NEXT: XOR_INT T0.W, T4.W, T2.W, BS:VEC_120/SCL_212
; EG-NEXT: XOR_INT * T1.W, PV.Y, PV.Z,
; EG-NEXT: SUB_INT T0.Y, PS, T0.Z,
; EG-NEXT: XOR_INT * T1.W, PV.Z, PV.W,
; EG-NEXT: SUB_INT T0.X, PV.W, T0.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  ; The divisor vector lives one <2 x i32> element past the dividend vector.
  %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
  %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
  %result = sdiv <2 x i32> %num, %den
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

; <2 x i32> sdiv of a loaded vector by the splat constant <4, 4>, stored to
; %out. Each lane is expected to use the same shift/add sequence (arithmetic
; shift by 31, logical shift by 30, add, arithmetic shift by 2) as the scalar
; sdiv_i32_4 case.
define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
; GCN-LABEL: sdiv_v2i32_4:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s10, s6
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1
; GCN-NEXT: v_lshrrev_b32_e32 v2, 30, v2
; GCN-NEXT: v_lshrrev_b32_e32 v3, 30, v3
; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0
; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: sdiv_v2i32_4:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_mov_b32 s4, s6
; TONGA-NEXT: s_mov_b32 s5, s7
; TONGA-NEXT: s_mov_b32 s6, s2
; TONGA-NEXT: s_mov_b32 s7, s3
; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
; TONGA-NEXT: v_lshrrev_b32_e32 v2, 30, v2
; TONGA-NEXT: v_lshrrev_b32_e32 v3, 30, v3
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1
; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i32_4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s2
; GFX9-NEXT: s_mov_b32 s7, s3
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 30, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 30, v3
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sdiv_v2i32_4:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: ASHR * T0.W, T0.Y, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.W, PV.W, literal.x,
; EG-NEXT: ASHR * T1.W, T0.X, literal.y,
; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44)
; EG-NEXT: LSHR T1.W, PS, literal.x,
; EG-NEXT: ADD_INT * T0.W, T0.Y, PV.W,
; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T0.Y, PS, literal.x,
; EG-NEXT: ADD_INT * T0.W, T0.X, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
  %result = sdiv <2 x i32> %num, <i32 4, i32 4>
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
; 
GCN-LABEL: sdiv_v4i32: 848; GCN: ; %bb.0: 849; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 850; GCN-NEXT: s_mov_b32 s19, 0xf000 851; GCN-NEXT: s_mov_b32 s18, -1 852; GCN-NEXT: s_mov_b32 s2, s18 853; GCN-NEXT: s_mov_b32 s3, s19 854; GCN-NEXT: s_waitcnt lgkmcnt(0) 855; GCN-NEXT: s_mov_b32 s0, s10 856; GCN-NEXT: s_mov_b32 s1, s11 857; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 858; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 859; GCN-NEXT: s_mov_b32 s6, 0x4f800000 860; GCN-NEXT: s_waitcnt vmcnt(1) 861; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v0 862; GCN-NEXT: s_waitcnt vmcnt(0) 863; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v4 864; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1 865; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 866; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2 867; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v6 868; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v3 869; GCN-NEXT: v_ashrrev_i32_e32 v15, 31, v7 870; GCN-NEXT: v_xor_b32_e32 v16, v8, v9 871; GCN-NEXT: v_add_i32_e32 v0, vcc, v8, v0 872; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1 873; GCN-NEXT: v_add_i32_e32 v2, vcc, v12, v2 874; GCN-NEXT: v_add_i32_e32 v3, vcc, v14, v3 875; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 876; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 877; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6 878; GCN-NEXT: v_add_i32_e32 v7, vcc, v15, v7 879; GCN-NEXT: v_xor_b32_e32 v17, v10, v11 880; GCN-NEXT: v_xor_b32_e32 v18, v12, v13 881; GCN-NEXT: v_xor_b32_e32 v19, v14, v15 882; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 883; GCN-NEXT: v_xor_b32_e32 v4, v4, v9 884; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 885; GCN-NEXT: v_xor_b32_e32 v5, v5, v11 886; GCN-NEXT: v_xor_b32_e32 v2, v2, v12 887; GCN-NEXT: v_xor_b32_e32 v6, v6, v13 888; GCN-NEXT: v_xor_b32_e32 v3, v3, v14 889; GCN-NEXT: v_xor_b32_e32 v7, v7, v15 890; GCN-NEXT: v_cvt_f32_u32_e32 v8, v4 891; GCN-NEXT: v_cvt_f32_u32_e32 v9, v5 892; GCN-NEXT: v_cvt_f32_u32_e32 v10, v6 893; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8 894; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v9 895; 
GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 896; GCN-NEXT: v_mul_f32_e32 v8, s6, v8 897; GCN-NEXT: v_mul_f32_e32 v9, s6, v9 898; GCN-NEXT: v_mul_f32_e32 v10, s6, v10 899; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 900; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 901; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 902; GCN-NEXT: v_mul_hi_u32 v11, v8, v4 903; GCN-NEXT: v_mul_lo_u32 v12, v8, v4 904; GCN-NEXT: v_mul_hi_u32 v13, v9, v5 905; GCN-NEXT: v_mul_lo_u32 v14, v9, v5 906; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v12 907; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 908; GCN-NEXT: v_mul_hi_u32 v11, v10, v6 909; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v15, s[0:1] 910; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v14 911; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v13 912; GCN-NEXT: v_mul_lo_u32 v13, v10, v6 913; GCN-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[2:3] 914; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v13 915; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 916; GCN-NEXT: v_cvt_f32_u32_e32 v11, v7 917; GCN-NEXT: v_rcp_iflag_f32_e32 v11, v11 918; GCN-NEXT: v_mul_f32_e32 v11, s6, v11 919; GCN-NEXT: v_cvt_u32_f32_e32 v11, v11 920; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5] 921; GCN-NEXT: v_mul_hi_u32 v15, v11, v7 922; GCN-NEXT: v_mul_lo_u32 v20, v11, v7 923; GCN-NEXT: v_sub_i32_e32 v21, vcc, 0, v20 924; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15 925; GCN-NEXT: v_cndmask_b32_e64 v15, v20, v21, s[6:7] 926; GCN-NEXT: v_mul_hi_u32 v12, v12, v8 927; GCN-NEXT: v_add_i32_e32 v20, vcc, v12, v8 928; GCN-NEXT: v_subrev_i32_e32 v8, vcc, v12, v8 929; GCN-NEXT: v_mul_hi_u32 v12, v14, v9 930; GCN-NEXT: v_add_i32_e32 v14, vcc, v12, v9 931; GCN-NEXT: v_subrev_i32_e32 v9, vcc, v12, v9 932; GCN-NEXT: v_mul_hi_u32 v12, v13, v10 933; GCN-NEXT: v_add_i32_e32 v13, vcc, v12, v10 934; GCN-NEXT: v_subrev_i32_e32 v10, vcc, v12, v10 935; GCN-NEXT: v_mul_hi_u32 v12, v15, v11 936; GCN-NEXT: v_add_i32_e32 v15, vcc, v12, v11 937; GCN-NEXT: v_subrev_i32_e32 v11, vcc, v12, v11 938; GCN-NEXT: s_mov_b32 s16, s8 939; GCN-NEXT: s_mov_b32 s17, s9 940; 
GCN-NEXT: v_cndmask_b32_e64 v8, v8, v20, s[0:1] 941; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[2:3] 942; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[4:5] 943; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[6:7] 944; GCN-NEXT: v_mul_hi_u32 v8, v8, v0 945; GCN-NEXT: v_mul_hi_u32 v9, v9, v1 946; GCN-NEXT: v_mul_hi_u32 v10, v10, v2 947; GCN-NEXT: v_mul_hi_u32 v11, v11, v3 948; GCN-NEXT: v_mul_lo_u32 v12, v8, v4 949; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v8 950; GCN-NEXT: v_add_i32_e32 v14, vcc, -1, v8 951; GCN-NEXT: v_mul_lo_u32 v15, v9, v5 952; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v12 953; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 954; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v9 955; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 956; GCN-NEXT: v_add_i32_e32 v0, vcc, -1, v9 957; GCN-NEXT: v_mul_lo_u32 v4, v10, v6 958; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v15 959; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v15 960; GCN-NEXT: v_add_i32_e32 v15, vcc, 1, v10 961; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 962; GCN-NEXT: v_add_i32_e32 v1, vcc, -1, v10 963; GCN-NEXT: v_mul_lo_u32 v5, v11, v7 964; GCN-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v4 965; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 966; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v11 967; GCN-NEXT: v_cmp_ge_u32_e64 s[10:11], v3, v5 968; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 969; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v11 970; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 971; GCN-NEXT: v_cmp_ge_u32_e64 s[12:13], v3, v7 972; GCN-NEXT: s_and_b64 s[2:3], s[2:3], s[0:1] 973; GCN-NEXT: v_cndmask_b32_e64 v2, v8, v13, s[2:3] 974; GCN-NEXT: s_and_b64 s[2:3], s[6:7], s[4:5] 975; GCN-NEXT: v_cndmask_b32_e64 v3, v9, v12, s[2:3] 976; GCN-NEXT: s_and_b64 vcc, vcc, s[8:9] 977; GCN-NEXT: v_cndmask_b32_e32 v6, v10, v15, vcc 978; GCN-NEXT: s_and_b64 vcc, s[12:13], s[10:11] 979; GCN-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc 980; GCN-NEXT: v_cndmask_b32_e64 v2, v14, v2, s[0:1] 981; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] 982; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v6, 
s[8:9] 983; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v5, s[10:11] 984; GCN-NEXT: v_xor_b32_e32 v2, v2, v16 985; GCN-NEXT: v_xor_b32_e32 v4, v0, v17 986; GCN-NEXT: v_xor_b32_e32 v5, v1, v18 987; GCN-NEXT: v_xor_b32_e32 v3, v3, v19 988; GCN-NEXT: v_sub_i32_e32 v0, vcc, v2, v16 989; GCN-NEXT: v_sub_i32_e32 v1, vcc, v4, v17 990; GCN-NEXT: v_sub_i32_e32 v2, vcc, v5, v18 991; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v19 992; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 993; GCN-NEXT: s_endpgm 994; 995; TONGA-LABEL: sdiv_v4i32: 996; TONGA: ; %bb.0: 997; TONGA-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 998; TONGA-NEXT: s_mov_b32 s11, 0xf000 999; TONGA-NEXT: s_mov_b32 s10, -1 1000; TONGA-NEXT: s_mov_b32 s2, s10 1001; TONGA-NEXT: s_mov_b32 s3, s11 1002; TONGA-NEXT: s_waitcnt lgkmcnt(0) 1003; TONGA-NEXT: s_mov_b32 s0, s14 1004; TONGA-NEXT: s_mov_b32 s1, s15 1005; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 1006; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 1007; TONGA-NEXT: s_mov_b32 s14, 0x4f800000 1008; TONGA-NEXT: s_mov_b32 s8, s12 1009; TONGA-NEXT: s_mov_b32 s9, s13 1010; TONGA-NEXT: s_waitcnt vmcnt(1) 1011; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v4 1012; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 1013; TONGA-NEXT: s_waitcnt vmcnt(0) 1014; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v0 1015; TONGA-NEXT: v_xor_b32_e32 v4, v4, v9 1016; TONGA-NEXT: v_xor_b32_e32 v15, v8, v9 1017; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v4 1018; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 1019; TONGA-NEXT: v_add_u32_e32 v5, vcc, v11, v5 1020; TONGA-NEXT: v_add_u32_e32 v0, vcc, v8, v0 1021; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v9 1022; TONGA-NEXT: v_xor_b32_e32 v5, v5, v11 1023; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 1024; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v5 1025; TONGA-NEXT: v_mul_f32_e32 v9, s14, v9 1026; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 1027; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 1028; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v8 1029; TONGA-NEXT: v_add_u32_e32 v1, vcc, 
v10, v1 1030; TONGA-NEXT: v_xor_b32_e32 v16, v10, v11 1031; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10 1032; TONGA-NEXT: v_mul_f32_e32 v8, s14, v8 1033; TONGA-NEXT: v_mul_hi_u32 v11, v9, v4 1034; TONGA-NEXT: v_mul_lo_u32 v10, v9, v4 1035; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8 1036; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v2 1037; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v6 1038; TONGA-NEXT: v_add_u32_e32 v2, vcc, v12, v2 1039; TONGA-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 1040; TONGA-NEXT: v_xor_b32_e32 v17, v12, v13 1041; TONGA-NEXT: v_xor_b32_e32 v2, v2, v12 1042; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v10 1043; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[0:1] 1044; TONGA-NEXT: v_mul_hi_u32 v12, v8, v5 1045; TONGA-NEXT: v_add_u32_e32 v6, vcc, v13, v6 1046; TONGA-NEXT: v_xor_b32_e32 v6, v6, v13 1047; TONGA-NEXT: v_mul_lo_u32 v11, v8, v5 1048; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v12 1049; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v6 1050; TONGA-NEXT: v_mul_hi_u32 v10, v10, v9 1051; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v11 1052; TONGA-NEXT: v_cndmask_b32_e64 v11, v11, v13, s[2:3] 1053; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12 1054; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7 1055; TONGA-NEXT: v_add_u32_e32 v7, vcc, v14, v7 1056; TONGA-NEXT: v_xor_b32_e32 v7, v7, v14 1057; TONGA-NEXT: v_mul_f32_e32 v12, s14, v12 1058; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v12 1059; TONGA-NEXT: v_mul_hi_u32 v18, v12, v6 1060; TONGA-NEXT: v_mul_lo_u32 v13, v12, v6 1061; TONGA-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 1062; TONGA-NEXT: v_add_u32_e32 v18, vcc, v10, v9 1063; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v9 1064; TONGA-NEXT: v_mul_hi_u32 v10, v11, v8 1065; TONGA-NEXT: v_cndmask_b32_e64 v9, v9, v18, s[0:1] 1066; TONGA-NEXT: v_mul_hi_u32 v9, v9, v0 1067; TONGA-NEXT: v_sub_u32_e32 v19, vcc, 0, v13 1068; TONGA-NEXT: v_add_u32_e32 v11, vcc, v10, v8 1069; TONGA-NEXT: v_subrev_u32_e32 v8, vcc, v10, v8 1070; TONGA-NEXT: v_cndmask_b32_e64 v13, v13, v19, s[4:5] 1071; TONGA-NEXT: v_cndmask_b32_e64 v8, 
v8, v11, s[2:3] 1072; TONGA-NEXT: v_mul_hi_u32 v10, v13, v12 1073; TONGA-NEXT: v_mul_lo_u32 v11, v9, v4 1074; TONGA-NEXT: v_mul_hi_u32 v8, v8, v1 1075; TONGA-NEXT: v_add_u32_e32 v13, vcc, v10, v12 1076; TONGA-NEXT: v_subrev_u32_e32 v10, vcc, v10, v12 1077; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v11 1078; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v11 1079; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 1080; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[4:5] 1081; TONGA-NEXT: v_mul_lo_u32 v0, v8, v5 1082; TONGA-NEXT: v_mul_hi_u32 v4, v10, v2 1083; TONGA-NEXT: v_add_u32_e32 v12, vcc, -1, v9 1084; TONGA-NEXT: v_add_u32_e32 v10, vcc, -1, v8 1085; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v0 1086; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 1087; TONGA-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v5 1088; TONGA-NEXT: v_mul_lo_u32 v5, v4, v6 1089; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v9 1090; TONGA-NEXT: v_add_u32_e32 v0, vcc, 1, v8 1091; TONGA-NEXT: s_and_b64 vcc, s[2:3], s[0:1] 1092; TONGA-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc 1093; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v2, v5 1094; TONGA-NEXT: s_and_b64 vcc, s[6:7], s[4:5] 1095; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v7 1096; TONGA-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc 1097; TONGA-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[0:1] 1098; TONGA-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[4:5] 1099; TONGA-NEXT: v_xor_b32_e32 v1, v1, v15 1100; TONGA-NEXT: v_xor_b32_e32 v8, v0, v16 1101; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v15 1102; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v8, v16 1103; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v11 1104; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v9, v6 1105; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v2, v5 1106; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v3 1107; TONGA-NEXT: v_mul_f32_e32 v8, s14, v8 1108; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8 1109; TONGA-NEXT: v_add_u32_e32 v3, vcc, v10, v3 1110; TONGA-NEXT: v_xor_b32_e32 v3, v3, v10 1111; TONGA-NEXT: v_add_u32_e32 v6, vcc, -1, v4 1112; TONGA-NEXT: v_mul_lo_u32 v5, v8, v7 1113; TONGA-NEXT: 
v_mul_hi_u32 v9, v8, v7 1114; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v4 1115; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v5 1116; TONGA-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 1117; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5] 1118; TONGA-NEXT: v_mul_hi_u32 v5, v5, v8 1119; TONGA-NEXT: v_add_u32_e32 v9, vcc, v5, v8 1120; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v5, v8 1121; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5] 1122; TONGA-NEXT: v_mul_hi_u32 v5, v5, v3 1123; TONGA-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1124; TONGA-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 1125; TONGA-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] 1126; TONGA-NEXT: v_mul_lo_u32 v4, v5, v7 1127; TONGA-NEXT: v_xor_b32_e32 v2, v2, v17 1128; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v17 1129; TONGA-NEXT: v_xor_b32_e32 v6, v10, v14 1130; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v3, v4 1131; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v8, v7 1132; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v3, v4 1133; TONGA-NEXT: v_add_u32_e32 v7, vcc, -1, v5 1134; TONGA-NEXT: v_add_u32_e32 v3, vcc, 1, v5 1135; TONGA-NEXT: s_and_b64 vcc, s[0:1], s[2:3] 1136; TONGA-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 1137; TONGA-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] 1138; TONGA-NEXT: v_xor_b32_e32 v3, v3, v6 1139; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 1140; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 1141; TONGA-NEXT: s_endpgm 1142; 1143; GFX9-LABEL: sdiv_v4i32: 1144; GFX9: ; %bb.0: 1145; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 1146; GFX9-NEXT: s_mov_b32 s15, 0xf000 1147; GFX9-NEXT: s_mov_b32 s14, -1 1148; GFX9-NEXT: s_mov_b32 s2, s14 1149; GFX9-NEXT: s_mov_b32 s3, s15 1150; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1151; GFX9-NEXT: s_mov_b32 s0, s10 1152; GFX9-NEXT: s_mov_b32 s1, s11 1153; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 1154; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 1155; GFX9-NEXT: s_mov_b32 s4, 0x4f800000 1156; GFX9-NEXT: s_mov_b32 s12, s8 1157; GFX9-NEXT: s_mov_b32 s13, s9 1158; GFX9-NEXT: 
s_waitcnt vmcnt(1) 1159; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v4 1160; GFX9-NEXT: s_waitcnt vmcnt(0) 1161; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0 1162; GFX9-NEXT: v_add_u32_e32 v4, v4, v9 1163; GFX9-NEXT: v_add_u32_e32 v0, v0, v8 1164; GFX9-NEXT: v_xor_b32_e32 v4, v4, v9 1165; GFX9-NEXT: v_xor_b32_e32 v16, v8, v9 1166; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 1167; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v4 1168; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v5 1169; GFX9-NEXT: v_add_u32_e32 v5, v5, v11 1170; GFX9-NEXT: v_xor_b32_e32 v5, v5, v11 1171; GFX9-NEXT: v_cvt_f32_u32_e32 v9, v5 1172; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8 1173; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v6 1174; GFX9-NEXT: v_ashrrev_i32_e32 v10, 31, v1 1175; GFX9-NEXT: v_add_u32_e32 v6, v6, v13 1176; GFX9-NEXT: v_add_u32_e32 v1, v1, v10 1177; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v9 1178; GFX9-NEXT: v_mul_f32_e32 v8, s4, v8 1179; GFX9-NEXT: v_xor_b32_e32 v6, v6, v13 1180; GFX9-NEXT: v_xor_b32_e32 v17, v10, v11 1181; GFX9-NEXT: v_xor_b32_e32 v1, v1, v10 1182; GFX9-NEXT: v_cvt_f32_u32_e32 v10, v6 1183; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8 1184; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v2 1185; GFX9-NEXT: v_add_u32_e32 v2, v2, v12 1186; GFX9-NEXT: v_mul_f32_e32 v9, s4, v9 1187; GFX9-NEXT: v_rcp_iflag_f32_e32 v10, v10 1188; GFX9-NEXT: v_xor_b32_e32 v18, v12, v13 1189; GFX9-NEXT: v_xor_b32_e32 v2, v2, v12 1190; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9 1191; GFX9-NEXT: v_mul_hi_u32 v12, v8, v4 1192; GFX9-NEXT: v_mul_lo_u32 v11, v8, v4 1193; GFX9-NEXT: v_mul_f32_e32 v10, s4, v10 1194; GFX9-NEXT: v_mul_lo_u32 v13, v9, v5 1195; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 1196; GFX9-NEXT: v_mul_hi_u32 v12, v9, v5 1197; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v10 1198; GFX9-NEXT: v_sub_u32_e32 v19, 0, v11 1199; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v19, vcc 1200; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v12 1201; GFX9-NEXT: v_sub_u32_e32 v19, 0, v13 1202; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v19, s[0:1] 1203; GFX9-NEXT: v_mul_hi_u32 v19, v10, v6 
1204; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v7 1205; GFX9-NEXT: v_add_u32_e32 v7, v7, v15 1206; GFX9-NEXT: v_xor_b32_e32 v7, v7, v15 1207; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v19 1208; GFX9-NEXT: v_cvt_f32_u32_e32 v19, v7 1209; GFX9-NEXT: v_mul_hi_u32 v11, v11, v8 1210; GFX9-NEXT: v_mul_lo_u32 v12, v10, v6 1211; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v3 1212; GFX9-NEXT: v_rcp_iflag_f32_e32 v19, v19 1213; GFX9-NEXT: v_add_u32_e32 v3, v3, v14 1214; GFX9-NEXT: v_sub_u32_e32 v20, 0, v12 1215; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v20, s[2:3] 1216; GFX9-NEXT: v_mul_f32_e32 v19, s4, v19 1217; GFX9-NEXT: v_cvt_u32_f32_e32 v19, v19 1218; GFX9-NEXT: v_xor_b32_e32 v3, v3, v14 1219; GFX9-NEXT: v_mul_hi_u32 v21, v19, v7 1220; GFX9-NEXT: v_mul_lo_u32 v20, v19, v7 1221; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v21 1222; GFX9-NEXT: v_add_u32_e32 v21, v8, v11 1223; GFX9-NEXT: v_sub_u32_e32 v8, v8, v11 1224; GFX9-NEXT: v_mul_hi_u32 v11, v13, v9 1225; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v21, vcc 1226; GFX9-NEXT: v_mul_hi_u32 v8, v8, v0 1227; GFX9-NEXT: v_sub_u32_e32 v22, 0, v20 1228; GFX9-NEXT: v_add_u32_e32 v13, v9, v11 1229; GFX9-NEXT: v_sub_u32_e32 v9, v9, v11 1230; GFX9-NEXT: v_mul_hi_u32 v11, v12, v10 1231; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v13, s[0:1] 1232; GFX9-NEXT: v_mul_hi_u32 v9, v9, v1 1233; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v22, s[4:5] 1234; GFX9-NEXT: v_add_u32_e32 v12, v10, v11 1235; GFX9-NEXT: v_sub_u32_e32 v10, v10, v11 1236; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[2:3] 1237; GFX9-NEXT: v_mul_lo_u32 v12, v8, v4 1238; GFX9-NEXT: v_mul_hi_u32 v11, v20, v19 1239; GFX9-NEXT: v_mul_hi_u32 v10, v10, v2 1240; GFX9-NEXT: v_add_u32_e32 v13, 1, v8 1241; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v12 1242; GFX9-NEXT: v_sub_u32_e32 v0, v0, v12 1243; GFX9-NEXT: v_mul_lo_u32 v12, v9, v5 1244; GFX9-NEXT: v_add_u32_e32 v20, v19, v11 1245; GFX9-NEXT: v_sub_u32_e32 v11, v19, v11 1246; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v20, s[4:5] 1247; GFX9-NEXT: v_cmp_ge_u32_e64 
s[2:3], v1, v12 1248; GFX9-NEXT: v_sub_u32_e32 v1, v1, v12 1249; GFX9-NEXT: v_mul_lo_u32 v12, v10, v6 1250; GFX9-NEXT: v_mul_hi_u32 v11, v11, v3 1251; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 1252; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 1253; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v12 1254; GFX9-NEXT: v_sub_u32_e32 v2, v2, v12 1255; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc 1256; GFX9-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6 1257; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v13, s[0:1] 1258; GFX9-NEXT: v_add_u32_e32 v0, 1, v9 1259; GFX9-NEXT: s_and_b64 s[0:1], s[4:5], s[2:3] 1260; GFX9-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[0:1] 1261; GFX9-NEXT: v_add_u32_e32 v1, 1, v10 1262; GFX9-NEXT: s_and_b64 s[0:1], s[8:9], s[6:7] 1263; GFX9-NEXT: v_mul_lo_u32 v12, v11, v7 1264; GFX9-NEXT: v_add_u32_e32 v19, -1, v8 1265; GFX9-NEXT: v_cndmask_b32_e64 v1, v10, v1, s[0:1] 1266; GFX9-NEXT: v_add_u32_e32 v5, -1, v10 1267; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc 1268; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[6:7] 1269; GFX9-NEXT: v_add_u32_e32 v4, -1, v9 1270; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[2:3] 1271; GFX9-NEXT: v_xor_b32_e32 v2, v2, v16 1272; GFX9-NEXT: v_xor_b32_e32 v5, v1, v18 1273; GFX9-NEXT: v_xor_b32_e32 v4, v0, v17 1274; GFX9-NEXT: v_sub_u32_e32 v0, v2, v16 1275; GFX9-NEXT: v_sub_u32_e32 v2, v5, v18 1276; GFX9-NEXT: v_sub_u32_e32 v5, v3, v12 1277; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v5, v7 1278; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v12 1279; GFX9-NEXT: v_add_u32_e32 v3, 1, v11 1280; GFX9-NEXT: s_and_b64 vcc, vcc, s[0:1] 1281; GFX9-NEXT: v_add_u32_e32 v5, -1, v11 1282; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc 1283; GFX9-NEXT: v_sub_u32_e32 v1, v4, v17 1284; GFX9-NEXT: v_xor_b32_e32 v4, v14, v15 1285; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] 1286; GFX9-NEXT: v_xor_b32_e32 v3, v3, v4 1287; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4 1288; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 1289; GFX9-NEXT: s_endpgm 1290; 1291; EG-LABEL: sdiv_v4i32: 1292; 
EG: ; %bb.0: 1293; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] 1294; EG-NEXT: TEX 0 @8 1295; EG-NEXT: ALU 2, @13, KC0[], KC1[] 1296; EG-NEXT: TEX 0 @10 1297; EG-NEXT: ALU 114, @16, KC0[CB0:0-32], KC1[] 1298; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1 1299; EG-NEXT: CF_END 1300; EG-NEXT: PAD 1301; EG-NEXT: Fetch clause starting at 8: 1302; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 1303; EG-NEXT: Fetch clause starting at 10: 1304; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 0, #1 1305; EG-NEXT: ALU clause starting at 12: 1306; EG-NEXT: MOV * T0.X, KC0[2].Z, 1307; EG-NEXT: ALU clause starting at 13: 1308; EG-NEXT: SETGT_INT * T0.W, 0.0, T1.Z, 1309; EG-NEXT: ADD_INT * T2.W, T1.Z, PV.W, 1310; EG-NEXT: XOR_INT * T2.W, PV.W, T0.W, 1311; EG-NEXT: ALU clause starting at 16: 1312; EG-NEXT: RECIP_UINT * T0.X, T2.W, 1313; EG-NEXT: MULLO_INT * T0.Y, PS, T2.W, 1314; EG-NEXT: SUB_INT T4.W, 0.0, PS, 1315; EG-NEXT: MULHI * T0.Z, T0.X, T2.W, 1316; EG-NEXT: CNDE_INT T4.W, PS, PV.W, T0.Y, 1317; EG-NEXT: SETGT_INT * T5.W, 0.0, T3.Z, 1318; EG-NEXT: MULHI * T0.Y, PV.W, T0.X, 1319; EG-NEXT: SETGT_INT T2.Y, 0.0, T1.W, 1320; EG-NEXT: ADD_INT T1.Z, T3.Z, T5.W, BS:VEC_021/SCL_122 1321; EG-NEXT: ADD_INT T4.W, T0.X, PS, 1322; EG-NEXT: SUB_INT * T6.W, T0.X, PS, 1323; EG-NEXT: CNDE_INT T0.Z, T0.Z, PV.W, PS, 1324; EG-NEXT: XOR_INT T4.W, PV.Z, T5.W, 1325; EG-NEXT: ADD_INT * T1.W, T1.W, PV.Y, 1326; EG-NEXT: XOR_INT T1.W, PS, T2.Y, 1327; EG-NEXT: MULHI * T0.X, PV.Z, PV.W, 1328; EG-NEXT: SETGT_INT T6.W, 0.0, T1.Y, 1329; EG-NEXT: RECIP_UINT * T0.Y, PV.W, 1330; EG-NEXT: ADD_INT T7.W, T1.Y, PV.W, 1331; EG-NEXT: MULLO_INT * T0.Z, PS, T1.W, 1332; EG-NEXT: XOR_INT T1.Z, PV.W, T6.W, BS:VEC_021/SCL_122 1333; EG-NEXT: SUB_INT T7.W, 0.0, PS, 1334; EG-NEXT: MULHI * T1.Y, T0.Y, T1.W, 1335; EG-NEXT: CNDE_INT T7.W, PS, PV.W, T0.Z, 1336; EG-NEXT: RECIP_UINT * T0.Z, PV.Z, 1337; EG-NEXT: SETGT_INT T8.W, 0.0, T3.W, 1338; EG-NEXT: MULHI * T2.X, PV.W, T0.Y, 1339; EG-NEXT: ADD_INT T4.Y, T3.W, PV.W, 1340; EG-NEXT: 
ADD_INT T2.Z, T0.Y, PS, 1341; EG-NEXT: SUB_INT T3.W, T0.Y, PS, 1342; EG-NEXT: MULLO_INT * T0.Y, T0.Z, T1.Z, 1343; EG-NEXT: CNDE_INT T2.X, T1.Y, PV.Z, PV.W, 1344; EG-NEXT: XOR_INT T1.Y, PV.Y, T8.W, 1345; EG-NEXT: SETGT_INT T2.Z, 0.0, T1.X, 1346; EG-NEXT: SUB_INT T3.W, 0.0, PS, 1347; EG-NEXT: MULHI * T3.Z, T0.Z, T1.Z, 1348; EG-NEXT: CNDE_INT T4.Z, PS, PV.W, T0.Y, 1349; EG-NEXT: ADD_INT T3.W, T1.X, PV.Z, 1350; EG-NEXT: MULHI * T0.Y, PV.X, PV.Y, 1351; EG-NEXT: XOR_INT T3.W, PV.W, T2.Z, BS:VEC_021/SCL_122 1352; EG-NEXT: MULHI * T1.X, PV.Z, T0.Z, 1353; EG-NEXT: RECIP_UINT * T2.X, PV.W, 1354; EG-NEXT: MULLO_INT * T4.X, PS, T3.W, 1355; EG-NEXT: SETGT_INT T4.Z, 0.0, T3.Y, 1356; EG-NEXT: SUB_INT T7.W, 0.0, PS, 1357; EG-NEXT: MULHI * T4.Y, T2.X, T3.W, 1358; EG-NEXT: CNDE_INT T4.X, PS, PV.W, T4.X, 1359; EG-NEXT: ADD_INT T3.Y, T3.Y, PV.Z, 1360; EG-NEXT: ADD_INT T5.Z, T0.Z, T1.X, 1361; EG-NEXT: SUB_INT T7.W, T0.Z, T1.X, 1362; EG-NEXT: MULLO_INT * T0.Z, T0.Y, T1.W, 1363; EG-NEXT: CNDE_INT T5.Y, T3.Z, PV.Z, PV.W, 1364; EG-NEXT: XOR_INT T3.Z, PV.Y, T4.Z, 1365; EG-NEXT: SUB_INT T7.W, T1.Y, PS, 1366; EG-NEXT: MULHI * T1.X, PV.X, T2.X, 1367; EG-NEXT: SETGE_UINT T5.Z, PV.W, T1.W, 1368; EG-NEXT: SETGE_UINT T1.W, T1.Y, T0.Z, 1369; EG-NEXT: MULHI * T0.Z, PV.Y, PV.Z, 1370; EG-NEXT: AND_INT T1.Y, PV.Z, PV.W, 1371; EG-NEXT: ADD_INT T5.Z, T0.Y, 1, 1372; EG-NEXT: SETGT_INT T7.W, 0.0, T3.X, 1373; EG-NEXT: MULLO_INT * T3.Y, PS, T1.Z, 1374; EG-NEXT: SUB_INT T4.X, T3.Z, PS, 1375; EG-NEXT: ADD_INT T5.Y, T3.X, PV.W, 1376; EG-NEXT: ADD_INT T6.Z, T2.X, T1.X, BS:VEC_120/SCL_212 1377; EG-NEXT: SUB_INT * T9.W, T2.X, T1.X, BS:VEC_120/SCL_212 1378; EG-NEXT: MULLO_INT * T1.X, T0.X, T2.W, 1379; EG-NEXT: CNDE_INT T2.X, T4.Y, T6.Z, T9.W, 1380; EG-NEXT: XOR_INT T4.Y, T5.Y, T7.W, BS:VEC_201 1381; EG-NEXT: SUB_INT T6.Z, T4.W, PS, BS:VEC_120/SCL_212 1382; EG-NEXT: SETGE_UINT T9.W, T4.X, T1.Z, BS:VEC_102/SCL_221 1383; EG-NEXT: SETGE_UINT * T10.W, T3.Z, T3.Y, 1384; EG-NEXT: AND_INT T3.X, PV.W, PS, 1385; EG-NEXT: 
ADD_INT T3.Y, T0.Z, 1, 1386; EG-NEXT: SETGE_UINT T1.Z, PV.Z, T2.W, 1387; EG-NEXT: SETGE_UINT T2.W, T4.W, T1.X, 1388; EG-NEXT: MULHI * T1.X, PV.X, PV.Y, 1389; EG-NEXT: AND_INT T2.X, PV.Z, PV.W, 1390; EG-NEXT: ADD_INT T5.Y, T0.X, 1, 1391; EG-NEXT: CNDE_INT T1.Z, PV.X, T0.Z, PV.Y, 1392; EG-NEXT: ADD_INT T4.W, T0.Z, literal.x, 1393; EG-NEXT: MULLO_INT * T0.Z, PS, T3.W, 1394; EG-NEXT: -1(nan), 0(0.000000e+00) 1395; EG-NEXT: CNDE_INT T3.X, T10.W, PV.W, PV.Z, 1396; EG-NEXT: CNDE_INT T3.Y, PV.X, T0.X, PV.Y, 1397; EG-NEXT: CNDE_INT T1.Z, T1.Y, T0.Y, T5.Z, 1398; EG-NEXT: ADD_INT T4.W, T0.Y, literal.x, BS:VEC_120/SCL_212 1399; EG-NEXT: SUB_INT * T9.W, T4.Y, PS, 1400; EG-NEXT: -1(nan), 0(0.000000e+00) 1401; EG-NEXT: ADD_INT T0.X, T0.X, literal.x, 1402; EG-NEXT: SETGE_UINT T0.Y, PS, T3.W, 1403; EG-NEXT: SETGE_UINT T0.Z, T4.Y, T0.Z, 1404; EG-NEXT: CNDE_INT T1.W, T1.W, PV.W, PV.Z, 1405; EG-NEXT: XOR_INT * T3.W, T8.W, T2.Y, 1406; EG-NEXT: -1(nan), 0(0.000000e+00) 1407; EG-NEXT: XOR_INT T2.X, PV.W, PS, 1408; EG-NEXT: AND_INT T0.Y, PV.Y, PV.Z, 1409; EG-NEXT: ADD_INT T1.Z, T1.X, 1, 1410; EG-NEXT: CNDE_INT T1.W, T2.W, PV.X, T3.Y, 1411; EG-NEXT: XOR_INT * T0.W, T5.W, T0.W, 1412; EG-NEXT: XOR_INT T0.X, T4.Z, T6.W, BS:VEC_021/SCL_122 1413; EG-NEXT: XOR_INT T1.Y, PV.W, PS, 1414; EG-NEXT: CNDE_INT T1.Z, PV.Y, T1.X, PV.Z, 1415; EG-NEXT: ADD_INT T1.W, T1.X, literal.x, 1416; EG-NEXT: SUB_INT * T3.W, PV.X, T3.W, 1417; EG-NEXT: -1(nan), 0(0.000000e+00) 1418; EG-NEXT: CNDE_INT T0.Y, T0.Z, PV.W, PV.Z, 1419; EG-NEXT: SUB_INT T3.Z, PV.Y, T0.W, 1420; EG-NEXT: XOR_INT T0.W, T7.W, T2.Z, 1421; EG-NEXT: XOR_INT * T1.W, T3.X, PV.X, 1422; EG-NEXT: SUB_INT T3.Y, PS, T0.X, 1423; EG-NEXT: XOR_INT * T1.W, PV.Y, PV.W, 1424; EG-NEXT: SUB_INT T3.X, PV.W, T0.W, 1425; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1426; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1427 %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 1428 %num = load <4 x i32>, <4 x i32> addrspace(1) * %in 1429 %den = load <4 x 
i32>, <4 x i32> addrspace(1) * %den_ptr 1430 %result = sdiv <4 x i32> %num, %den 1431 store <4 x i32> %result, <4 x i32> addrspace(1)* %out 1432 ret void 1433} 1434 1435define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { 1436; GCN-LABEL: sdiv_v4i32_4: 1437; GCN: ; %bb.0: 1438; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1439; GCN-NEXT: s_mov_b32 s7, 0xf000 1440; GCN-NEXT: s_mov_b32 s6, -1 1441; GCN-NEXT: s_mov_b32 s10, s6 1442; GCN-NEXT: s_mov_b32 s11, s7 1443; GCN-NEXT: s_waitcnt lgkmcnt(0) 1444; GCN-NEXT: s_mov_b32 s8, s2 1445; GCN-NEXT: s_mov_b32 s9, s3 1446; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 1447; GCN-NEXT: s_mov_b32 s4, s0 1448; GCN-NEXT: s_mov_b32 s5, s1 1449; GCN-NEXT: s_waitcnt vmcnt(0) 1450; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 1451; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1 1452; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v2 1453; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3 1454; GCN-NEXT: v_lshrrev_b32_e32 v4, 30, v4 1455; GCN-NEXT: v_lshrrev_b32_e32 v5, 30, v5 1456; GCN-NEXT: v_lshrrev_b32_e32 v6, 30, v6 1457; GCN-NEXT: v_lshrrev_b32_e32 v7, 30, v7 1458; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 1459; GCN-NEXT: v_add_i32_e32 v1, vcc, v5, v1 1460; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2 1461; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3 1462; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 1463; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1 1464; GCN-NEXT: v_ashrrev_i32_e32 v2, 2, v2 1465; GCN-NEXT: v_ashrrev_i32_e32 v3, 2, v3 1466; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1467; GCN-NEXT: s_endpgm 1468; 1469; TONGA-LABEL: sdiv_v4i32_4: 1470; TONGA: ; %bb.0: 1471; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1472; TONGA-NEXT: s_mov_b32 s3, 0xf000 1473; TONGA-NEXT: s_mov_b32 s2, -1 1474; TONGA-NEXT: s_waitcnt lgkmcnt(0) 1475; TONGA-NEXT: s_mov_b32 s0, s4 1476; TONGA-NEXT: s_mov_b32 s1, s5 1477; TONGA-NEXT: s_mov_b32 s4, s6 1478; TONGA-NEXT: s_mov_b32 s5, s7 1479; TONGA-NEXT: s_mov_b32 s6, s2 1480; TONGA-NEXT: 
s_mov_b32 s7, s3 1481; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 1482; TONGA-NEXT: s_waitcnt vmcnt(0) 1483; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 1484; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v1 1485; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v2 1486; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 1487; TONGA-NEXT: v_lshrrev_b32_e32 v4, 30, v4 1488; TONGA-NEXT: v_lshrrev_b32_e32 v5, 30, v5 1489; TONGA-NEXT: v_lshrrev_b32_e32 v6, 30, v6 1490; TONGA-NEXT: v_lshrrev_b32_e32 v7, 30, v7 1491; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 1492; TONGA-NEXT: v_add_u32_e32 v1, vcc, v5, v1 1493; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v2 1494; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3 1495; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 1496; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1 1497; TONGA-NEXT: v_ashrrev_i32_e32 v2, 2, v2 1498; TONGA-NEXT: v_ashrrev_i32_e32 v3, 2, v3 1499; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1500; TONGA-NEXT: s_endpgm 1501; 1502; GFX9-LABEL: sdiv_v4i32_4: 1503; GFX9: ; %bb.0: 1504; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1505; GFX9-NEXT: s_mov_b32 s3, 0xf000 1506; GFX9-NEXT: s_mov_b32 s2, -1 1507; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1508; GFX9-NEXT: s_mov_b32 s0, s4 1509; GFX9-NEXT: s_mov_b32 s1, s5 1510; GFX9-NEXT: s_mov_b32 s4, s6 1511; GFX9-NEXT: s_mov_b32 s5, s7 1512; GFX9-NEXT: s_mov_b32 s6, s2 1513; GFX9-NEXT: s_mov_b32 s7, s3 1514; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 1515; GFX9-NEXT: s_waitcnt vmcnt(0) 1516; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0 1517; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1 1518; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v2 1519; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v3 1520; GFX9-NEXT: v_lshrrev_b32_e32 v4, 30, v4 1521; GFX9-NEXT: v_lshrrev_b32_e32 v5, 30, v5 1522; GFX9-NEXT: v_lshrrev_b32_e32 v6, 30, v6 1523; GFX9-NEXT: v_lshrrev_b32_e32 v7, 30, v7 1524; GFX9-NEXT: v_add_u32_e32 v0, v0, v4 1525; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 1526; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 1527; GFX9-NEXT: 
v_add_u32_e32 v3, v3, v7 1528; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 1529; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1 1530; GFX9-NEXT: v_ashrrev_i32_e32 v2, 2, v2 1531; GFX9-NEXT: v_ashrrev_i32_e32 v3, 2, v3 1532; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1533; GFX9-NEXT: s_endpgm 1534; 1535; EG-LABEL: sdiv_v4i32_4: 1536; EG: ; %bb.0: 1537; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1538; EG-NEXT: TEX 0 @6 1539; EG-NEXT: ALU 24, @9, KC0[CB0:0-32], KC1[] 1540; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 1541; EG-NEXT: CF_END 1542; EG-NEXT: PAD 1543; EG-NEXT: Fetch clause starting at 6: 1544; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 1545; EG-NEXT: ALU clause starting at 8: 1546; EG-NEXT: MOV * T0.X, KC0[2].Z, 1547; EG-NEXT: ALU clause starting at 9: 1548; EG-NEXT: ASHR T1.W, T0.W, literal.x, 1549; EG-NEXT: ASHR * T2.W, T0.Z, literal.x, 1550; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1551; EG-NEXT: LSHR * T1.W, PV.W, literal.x, 1552; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) 1553; EG-NEXT: ADD_INT T1.Z, T0.W, PV.W, 1554; EG-NEXT: LSHR T0.W, T2.W, literal.x, BS:VEC_120/SCL_212 1555; EG-NEXT: ASHR * T1.W, T0.Y, literal.y, 1556; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44) 1557; EG-NEXT: LSHR T1.Y, PS, literal.x, 1558; EG-NEXT: ASHR T2.Z, T0.X, literal.y, 1559; EG-NEXT: ADD_INT T0.W, T0.Z, PV.W, 1560; EG-NEXT: ASHR * T1.W, PV.Z, literal.z, 1561; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44) 1562; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1563; EG-NEXT: ASHR T1.Z, PV.W, literal.x, 1564; EG-NEXT: LSHR T0.W, PV.Z, literal.y, 1565; EG-NEXT: ADD_INT * T2.W, T0.Y, PV.Y, 1566; EG-NEXT: 2(2.802597e-45), 30(4.203895e-44) 1567; EG-NEXT: ASHR T1.Y, PS, literal.x, 1568; EG-NEXT: ADD_INT * T0.W, T0.X, PV.W, 1569; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1570; EG-NEXT: ASHR T1.X, PV.W, literal.x, 1571; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1572; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1573 %num = load <4 x i32>, <4 x i32> addrspace(1) * %in 1574 %result = 
sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4> 1575 store <4 x i32> %result, <4 x i32> addrspace(1)* %out 1576 ret void 1577} 1578
; Signed 8-bit division: loads the numerator from %in and the denominator from the next i8 element,
; divides them with sdiv, and stores the quotient sign-extended to i32 in %out.
1579define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { 1580; GCN-LABEL: v_sdiv_i8: 1581; GCN: ; %bb.0: 1582; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1583; GCN-NEXT: s_mov_b32 s7, 0xf000 1584; GCN-NEXT: s_mov_b32 s6, -1 1585; GCN-NEXT: s_mov_b32 s10, s6 1586; GCN-NEXT: s_mov_b32 s11, s7 1587; GCN-NEXT: s_waitcnt lgkmcnt(0) 1588; GCN-NEXT: s_mov_b32 s8, s2 1589; GCN-NEXT: s_mov_b32 s9, s3 1590; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 1591; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1 1592; GCN-NEXT: s_mov_b32 s4, s0 1593; GCN-NEXT: s_mov_b32 s5, s1 1594; GCN-NEXT: s_waitcnt vmcnt(0) 1595; GCN-NEXT: v_xor_b32_e32 v2, v0, v1 1596; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 1597; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 1598; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2 1599; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 1600; GCN-NEXT: v_or_b32_e32 v2, 1, v2 1601; GCN-NEXT: v_mul_f32_e32 v3, v0, v3 1602; GCN-NEXT: v_trunc_f32_e32 v3, v3 1603; GCN-NEXT: v_mad_f32 v0, -v3, v1, v0 1604; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 1605; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| 1606; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc 1607; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 1608; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8 1609; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 1610; GCN-NEXT: s_endpgm 1611; 1612; TONGA-LABEL: v_sdiv_i8: 1613; TONGA: ; %bb.0: 1614; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1615; TONGA-NEXT: s_mov_b32 s3, 0xf000 1616; TONGA-NEXT: s_mov_b32 s2, -1 1617; TONGA-NEXT: s_waitcnt lgkmcnt(0) 1618; TONGA-NEXT: s_mov_b32 s0, s4 1619; TONGA-NEXT: s_mov_b32 s1, s5 1620; TONGA-NEXT: s_mov_b32 s4, s6 1621; TONGA-NEXT: s_mov_b32 s5, s7 1622; TONGA-NEXT: s_mov_b32 s6, s2 1623; TONGA-NEXT: s_mov_b32 s7, s3 1624; TONGA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:1 1625; TONGA-NEXT: buffer_load_sbyte v2, off, 
s[4:7], 0 1626; TONGA-NEXT: s_waitcnt vmcnt(1) 1627; TONGA-NEXT: v_cvt_f32_i32_e32 v1, v0 1628; TONGA-NEXT: s_waitcnt vmcnt(0) 1629; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v2 1630; TONGA-NEXT: v_xor_b32_e32 v0, v2, v0 1631; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 1632; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v1 1633; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 1634; TONGA-NEXT: v_mul_f32_e32 v2, v3, v4 1635; TONGA-NEXT: v_trunc_f32_e32 v2, v2 1636; TONGA-NEXT: v_mad_f32 v3, -v2, v1, v3 1637; TONGA-NEXT: v_cvt_i32_f32_e32 v2, v2 1638; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| 1639; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 1640; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1641; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 8 1642; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 1643; TONGA-NEXT: s_endpgm 1644; 1645; GFX9-LABEL: v_sdiv_i8: 1646; GFX9: ; %bb.0: 1647; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1648; GFX9-NEXT: s_mov_b32 s3, 0xf000 1649; GFX9-NEXT: s_mov_b32 s2, -1 1650; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1651; GFX9-NEXT: s_mov_b32 s0, s4 1652; GFX9-NEXT: s_mov_b32 s1, s5 1653; GFX9-NEXT: s_mov_b32 s4, s6 1654; GFX9-NEXT: s_mov_b32 s5, s7 1655; GFX9-NEXT: s_mov_b32 s6, s2 1656; GFX9-NEXT: s_mov_b32 s7, s3 1657; GFX9-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:1 1658; GFX9-NEXT: buffer_load_sbyte v2, off, s[4:7], 0 1659; GFX9-NEXT: s_waitcnt vmcnt(1) 1660; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v0 1661; GFX9-NEXT: s_waitcnt vmcnt(0) 1662; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v2 1663; GFX9-NEXT: v_xor_b32_e32 v0, v2, v0 1664; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 1665; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v1 1666; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 1667; GFX9-NEXT: v_mul_f32_e32 v2, v3, v4 1668; GFX9-NEXT: v_trunc_f32_e32 v2, v2 1669; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2 1670; GFX9-NEXT: v_mad_f32 v2, -v2, v1, v3 1671; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 1672; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 1673; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 1674; GFX9-NEXT: v_bfe_i32 v0, 
v0, 0, 8 1675; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1676; GFX9-NEXT: s_endpgm 1677; 1678; EG-LABEL: v_sdiv_i8: 1679; EG: ; %bb.0: 1680; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 1681; EG-NEXT: TEX 1 @6 1682; EG-NEXT: ALU 21, @11, KC0[CB0:0-32], KC1[] 1683; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1684; EG-NEXT: CF_END 1685; EG-NEXT: PAD 1686; EG-NEXT: Fetch clause starting at 6: 1687; EG-NEXT: VTX_READ_8 T1.X, T0.X, 1, #1 1688; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1689; EG-NEXT: ALU clause starting at 10: 1690; EG-NEXT: MOV * T0.X, KC0[2].Z, 1691; EG-NEXT: ALU clause starting at 11: 1692; EG-NEXT: BFE_INT * T0.W, T1.X, 0.0, literal.x, 1693; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1694; EG-NEXT: INT_TO_FLT * T0.Y, PV.W, 1695; EG-NEXT: BFE_INT T1.W, T0.X, 0.0, literal.x, 1696; EG-NEXT: RECIP_IEEE * T0.X, PS, 1697; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1698; EG-NEXT: INT_TO_FLT * T0.Z, PV.W, 1699; EG-NEXT: MUL_IEEE * T2.W, PS, T0.X, 1700; EG-NEXT: TRUNC T2.W, PV.W, 1701; EG-NEXT: XOR_INT * T0.W, T1.W, T0.W, 1702; EG-NEXT: ASHR T0.W, PS, literal.x, 1703; EG-NEXT: MULADD_IEEE * T1.W, -PV.W, T0.Y, T0.Z, 1704; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) 1705; EG-NEXT: TRUNC T0.Z, T2.W, 1706; EG-NEXT: SETGE T1.W, |PS|, |T0.Y|, 1707; EG-NEXT: OR_INT * T0.W, PV.W, 1, 1708; EG-NEXT: CNDE T0.W, PV.W, 0.0, PS, 1709; EG-NEXT: FLT_TO_INT * T1.W, PV.Z, 1710; EG-NEXT: ADD_INT * T0.W, PS, PV.W, 1711; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x, 1712; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1713; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) 1714 %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 1715 %num = load i8, i8 addrspace(1) * %in 1716 %den = load i8, i8 addrspace(1) * %den_ptr 1717 %result = sdiv i8 %num, %den 1718 %result.ext = sext i8 %result to i32 1719 store i32 %result.ext, i32 addrspace(1)* %out 1720 ret void 1721} 1722 ; Signed 23-bit division: loads the numerator from %in and the denominator from the next i23 element, divides them with sdiv, and stores the quotient sign-extended to i32 in %out. The checks that follow expect a float-reciprocal sequence (v_rcp_iflag_f32 / RECIP_IEEE) on every target. 1723define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { 1724; GCN-LABEL: v_sdiv_i23: 
1725; GCN: ; %bb.0: 1726; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1727; GCN-NEXT: s_mov_b32 s7, 0xf000 1728; GCN-NEXT: s_mov_b32 s6, -1 1729; GCN-NEXT: s_mov_b32 s10, s6 1730; GCN-NEXT: s_mov_b32 s11, s7 1731; GCN-NEXT: s_waitcnt lgkmcnt(0) 1732; GCN-NEXT: s_mov_b32 s4, s0 1733; GCN-NEXT: s_mov_b32 s5, s1 1734; GCN-NEXT: s_mov_b32 s8, s2 1735; GCN-NEXT: s_mov_b32 s9, s3 1736; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 1737; GCN-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6 1738; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0 1739; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 1740; GCN-NEXT: s_waitcnt vmcnt(3) 1741; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1742; GCN-NEXT: s_waitcnt vmcnt(2) 1743; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1744; GCN-NEXT: s_waitcnt vmcnt(1) 1745; GCN-NEXT: v_or_b32_e32 v0, v0, v1 1746; GCN-NEXT: s_waitcnt vmcnt(0) 1747; GCN-NEXT: v_or_b32_e32 v1, v2, v3 1748; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 1749; GCN-NEXT: v_bfe_i32 v1, v1, 0, 23 1750; GCN-NEXT: v_xor_b32_e32 v2, v0, v1 1751; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 1752; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 1753; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2 1754; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 1755; GCN-NEXT: v_or_b32_e32 v2, 1, v2 1756; GCN-NEXT: v_mul_f32_e32 v3, v0, v3 1757; GCN-NEXT: v_trunc_f32_e32 v3, v3 1758; GCN-NEXT: v_mad_f32 v0, -v3, v1, v0 1759; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 1760; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1| 1761; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc 1762; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 1763; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 1764; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 1765; GCN-NEXT: s_endpgm 1766; 1767; TONGA-LABEL: v_sdiv_i23: 1768; TONGA: ; %bb.0: 1769; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1770; TONGA-NEXT: s_mov_b32 s3, 0xf000 1771; TONGA-NEXT: s_mov_b32 s2, -1 1772; TONGA-NEXT: s_waitcnt lgkmcnt(0) 1773; TONGA-NEXT: s_mov_b32 s0, s4 1774; TONGA-NEXT: s_mov_b32 s1, s5 1775; 
TONGA-NEXT: s_mov_b32 s4, s6 1776; TONGA-NEXT: s_mov_b32 s5, s7 1777; TONGA-NEXT: s_mov_b32 s6, s2 1778; TONGA-NEXT: s_mov_b32 s7, s3 1779; TONGA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:2 1780; TONGA-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:4 1781; TONGA-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:6 1782; TONGA-NEXT: buffer_load_ushort v3, off, s[4:7], 0 1783; TONGA-NEXT: s_waitcnt vmcnt(3) 1784; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1785; TONGA-NEXT: s_waitcnt vmcnt(1) 1786; TONGA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1787; TONGA-NEXT: v_or_b32_e32 v1, v1, v2 1788; TONGA-NEXT: v_bfe_i32 v1, v1, 0, 23 1789; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v1 1790; TONGA-NEXT: s_waitcnt vmcnt(0) 1791; TONGA-NEXT: v_or_b32_e32 v0, v3, v0 1792; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 1793; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v0 1794; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2 1795; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 1796; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 1797; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 1798; TONGA-NEXT: v_mul_f32_e32 v1, v3, v4 1799; TONGA-NEXT: v_trunc_f32_e32 v1, v1 1800; TONGA-NEXT: v_mad_f32 v3, -v1, v2, v3 1801; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1 1802; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 1803; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 1804; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 1805; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 1806; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 1807; TONGA-NEXT: s_endpgm 1808; 1809; GFX9-LABEL: v_sdiv_i23: 1810; GFX9: ; %bb.0: 1811; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1812; GFX9-NEXT: s_mov_b32 s3, 0xf000 1813; GFX9-NEXT: s_mov_b32 s2, -1 1814; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1815; GFX9-NEXT: s_mov_b32 s0, s4 1816; GFX9-NEXT: s_mov_b32 s1, s5 1817; GFX9-NEXT: s_mov_b32 s4, s6 1818; GFX9-NEXT: s_mov_b32 s5, s7 1819; GFX9-NEXT: s_mov_b32 s6, s2 1820; GFX9-NEXT: s_mov_b32 s7, s3 1821; GFX9-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:2 1822; GFX9-NEXT: buffer_load_ushort v1, off, 
s[4:7], 0 offset:4 1823; GFX9-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:6 1824; GFX9-NEXT: buffer_load_ushort v3, off, s[4:7], 0 1825; GFX9-NEXT: s_waitcnt vmcnt(3) 1826; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1827; GFX9-NEXT: s_waitcnt vmcnt(1) 1828; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1829; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 1830; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 23 1831; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v1 1832; GFX9-NEXT: s_waitcnt vmcnt(0) 1833; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 1834; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23 1835; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v0 1836; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 1837; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 1838; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 1839; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 1840; GFX9-NEXT: v_mul_f32_e32 v1, v3, v4 1841; GFX9-NEXT: v_trunc_f32_e32 v1, v1 1842; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v1 1843; GFX9-NEXT: v_mad_f32 v1, -v1, v2, v3 1844; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 1845; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 1846; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 1847; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23 1848; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1849; GFX9-NEXT: s_endpgm 1850; 1851; EG-LABEL: v_sdiv_i23: 1852; EG: ; %bb.0: 1853; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 1854; EG-NEXT: TEX 3 @6 1855; EG-NEXT: ALU 33, @15, KC0[CB0:0-32], KC1[] 1856; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1857; EG-NEXT: CF_END 1858; EG-NEXT: PAD 1859; EG-NEXT: Fetch clause starting at 6: 1860; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1 1861; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1 1862; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1 1863; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 1864; EG-NEXT: ALU clause starting at 14: 1865; EG-NEXT: MOV * T0.X, KC0[2].Z, 1866; EG-NEXT: ALU clause starting at 15: 1867; EG-NEXT: LSHL * T0.W, T1.X, literal.x, 1868; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1869; EG-NEXT: OR_INT T0.W, T0.X, PV.W, 1870; EG-NEXT: LSHL * T1.W, T3.X, literal.x, 1871; EG-NEXT: 
16(2.242078e-44), 0(0.000000e+00) 1872; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 1873; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) 1874; EG-NEXT: ASHR T0.W, PV.W, literal.x, 1875; EG-NEXT: OR_INT * T1.W, T2.X, T1.W, 1876; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) 1877; EG-NEXT: LSHL T1.W, PS, literal.x, 1878; EG-NEXT: INT_TO_FLT * T0.X, PV.W, 1879; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) 1880; EG-NEXT: ASHR T1.W, PV.W, literal.x, 1881; EG-NEXT: RECIP_IEEE * T0.Y, PS, 1882; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) 1883; EG-NEXT: INT_TO_FLT * T0.Z, PV.W, 1884; EG-NEXT: MUL_IEEE * T2.W, PS, T0.Y, 1885; EG-NEXT: TRUNC T2.W, PV.W, 1886; EG-NEXT: XOR_INT * T0.W, T1.W, T0.W, 1887; EG-NEXT: ASHR T0.W, PS, literal.x, 1888; EG-NEXT: MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z, 1889; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) 1890; EG-NEXT: TRUNC T0.Z, T2.W, 1891; EG-NEXT: SETGE T1.W, |PS|, |T0.X|, 1892; EG-NEXT: OR_INT * T0.W, PV.W, 1, 1893; EG-NEXT: CNDE T0.W, PV.W, 0.0, PS, 1894; EG-NEXT: FLT_TO_INT * T1.W, PV.Z, 1895; EG-NEXT: ADD_INT * T0.W, PS, PV.W, 1896; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 1897; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) 1898; EG-NEXT: ASHR T0.X, PV.W, literal.x, 1899; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1900; EG-NEXT: 9(1.261169e-44), 2(2.802597e-45) 1901 %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1 1902 %num = load i23, i23 addrspace(1) * %in 1903 %den = load i23, i23 addrspace(1) * %den_ptr 1904 %result = sdiv i23 %num, %den 1905 %result.ext = sext i23 %result to i32 1906 store i32 %result.ext, i32 addrspace(1)* %out 1907 ret void 1908} 1909 ; Signed 24-bit division: loads the numerator from %in and the denominator from the next i24 element, divides them with sdiv, and stores the quotient sign-extended to i32 in %out. The checks that follow expect a float-reciprocal sequence (v_rcp_iflag_f32) on the amdgcn targets and an integer RECIP_UINT/MULHI sequence on r600. 1910define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { 1911; GCN-LABEL: v_sdiv_i24: 1912; GCN: ; %bb.0: 1913; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1914; GCN-NEXT: s_mov_b32 s7, 0xf000 1915; GCN-NEXT: s_mov_b32 s6, -1 1916; GCN-NEXT: s_mov_b32 s10, s6 1917; GCN-NEXT: s_mov_b32 s11, s7 1918; GCN-NEXT: s_waitcnt lgkmcnt(0) 1919; GCN-NEXT: s_mov_b32 s4, s0 
1920; GCN-NEXT: s_mov_b32 s5, s1 1921; GCN-NEXT: s_mov_b32 s8, s2 1922; GCN-NEXT: s_mov_b32 s9, s3 1923; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:2 1924; GCN-NEXT: buffer_load_sbyte v3, off, s[8:11], 0 offset:6 1925; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0 1926; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 1927; GCN-NEXT: s_waitcnt vmcnt(3) 1928; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1929; GCN-NEXT: s_waitcnt vmcnt(2) 1930; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1931; GCN-NEXT: s_waitcnt vmcnt(1) 1932; GCN-NEXT: v_or_b32_e32 v0, v0, v1 1933; GCN-NEXT: s_waitcnt vmcnt(0) 1934; GCN-NEXT: v_or_b32_e32 v2, v2, v3 1935; GCN-NEXT: v_xor_b32_e32 v1, v1, v3 1936; GCN-NEXT: v_ashrrev_i32_e32 v1, 30, v1 1937; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 1938; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 1939; GCN-NEXT: v_or_b32_e32 v1, 1, v1 1940; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 1941; GCN-NEXT: v_mul_f32_e32 v3, v0, v3 1942; GCN-NEXT: v_trunc_f32_e32 v3, v3 1943; GCN-NEXT: v_mad_f32 v0, -v3, v2, v0 1944; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 1945; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2| 1946; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc 1947; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 1948; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 1949; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 1950; GCN-NEXT: s_endpgm 1951; 1952; TONGA-LABEL: v_sdiv_i24: 1953; TONGA: ; %bb.0: 1954; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1955; TONGA-NEXT: s_mov_b32 s3, 0xf000 1956; TONGA-NEXT: s_mov_b32 s2, -1 1957; TONGA-NEXT: s_waitcnt lgkmcnt(0) 1958; TONGA-NEXT: s_mov_b32 s0, s4 1959; TONGA-NEXT: s_mov_b32 s1, s5 1960; TONGA-NEXT: s_mov_b32 s4, s6 1961; TONGA-NEXT: s_mov_b32 s5, s7 1962; TONGA-NEXT: s_mov_b32 s6, s2 1963; TONGA-NEXT: s_mov_b32 s7, s3 1964; TONGA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:2 1965; TONGA-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:4 1966; TONGA-NEXT: buffer_load_sbyte v2, off, s[4:7], 0 offset:6 1967; TONGA-NEXT: buffer_load_ushort v3, 
off, s[4:7], 0 1968; TONGA-NEXT: s_waitcnt vmcnt(3) 1969; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1970; TONGA-NEXT: s_waitcnt vmcnt(1) 1971; TONGA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1972; TONGA-NEXT: v_or_b32_e32 v1, v1, v2 1973; TONGA-NEXT: v_cvt_f32_i32_e32 v1, v1 1974; TONGA-NEXT: s_waitcnt vmcnt(0) 1975; TONGA-NEXT: v_or_b32_e32 v3, v3, v0 1976; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v3 1977; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2 1978; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v1 1979; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 1980; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 1981; TONGA-NEXT: v_mul_f32_e32 v2, v3, v4 1982; TONGA-NEXT: v_trunc_f32_e32 v2, v2 1983; TONGA-NEXT: v_mad_f32 v3, -v2, v1, v3 1984; TONGA-NEXT: v_cvt_i32_f32_e32 v2, v2 1985; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| 1986; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 1987; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1988; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 24 1989; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 1990; TONGA-NEXT: s_endpgm 1991; 1992; GFX9-LABEL: v_sdiv_i24: 1993; GFX9: ; %bb.0: 1994; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1995; GFX9-NEXT: s_mov_b32 s3, 0xf000 1996; GFX9-NEXT: s_mov_b32 s2, -1 1997; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1998; GFX9-NEXT: s_mov_b32 s0, s4 1999; GFX9-NEXT: s_mov_b32 s1, s5 2000; GFX9-NEXT: s_mov_b32 s4, s6 2001; GFX9-NEXT: s_mov_b32 s5, s7 2002; GFX9-NEXT: s_mov_b32 s6, s2 2003; GFX9-NEXT: s_mov_b32 s7, s3 2004; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 2005; GFX9-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 offset:2 2006; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 2007; GFX9-NEXT: buffer_load_sbyte v3, off, s[4:7], 0 offset:6 2008; GFX9-NEXT: s_waitcnt vmcnt(2) 2009; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2010; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 2011; GFX9-NEXT: s_waitcnt vmcnt(0) 2012; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2013; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 2014; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 2015; GFX9-NEXT: 
v_cvt_f32_i32_e32 v0, v0 2016; GFX9-NEXT: v_xor_b32_e32 v1, v1, v3 2017; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 2018; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 2019; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 2020; GFX9-NEXT: v_mul_f32_e32 v3, v0, v4 2021; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2022; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v3 2023; GFX9-NEXT: v_mad_f32 v0, -v3, v2, v0 2024; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2| 2025; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc 2026; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 2027; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 24 2028; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2029; GFX9-NEXT: s_endpgm 2030; 2031; EG-LABEL: v_sdiv_i24: 2032; EG: ; %bb.0: 2033; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 2034; EG-NEXT: TEX 3 @6 2035; EG-NEXT: ALU 43, @15, KC0[CB0:0-32], KC1[] 2036; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 2037; EG-NEXT: CF_END 2038; EG-NEXT: PAD 2039; EG-NEXT: Fetch clause starting at 6: 2040; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1 2041; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1 2042; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1 2043; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 2044; EG-NEXT: ALU clause starting at 14: 2045; EG-NEXT: MOV * T0.X, KC0[2].Z, 2046; EG-NEXT: ALU clause starting at 15: 2047; EG-NEXT: BFE_INT * T0.W, T1.X, 0.0, literal.x, 2048; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2049; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2050; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2051; EG-NEXT: OR_INT * T0.W, T0.X, PV.W, 2052; EG-NEXT: SETGT_INT * T1.W, 0.0, PV.W, 2053; EG-NEXT: ADD_INT * T0.W, T0.W, PV.W, 2054; EG-NEXT: XOR_INT * T0.W, PV.W, T1.W, 2055; EG-NEXT: RECIP_UINT * T0.X, PV.W, 2056; EG-NEXT: BFE_INT T2.W, T3.X, 0.0, literal.x, 2057; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, 2058; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2059; EG-NEXT: LSHL T0.Z, PV.W, literal.x, 2060; EG-NEXT: SUB_INT T2.W, 0.0, PS, 2061; EG-NEXT: MULHI * T1.X, T0.X, T0.W, 2062; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2063; EG-NEXT: CNDE_INT T2.W, PS, 
PV.W, T0.Y, 2064; EG-NEXT: OR_INT * T3.W, T2.X, PV.Z, 2065; EG-NEXT: SETGT_INT T4.W, 0.0, PS, 2066; EG-NEXT: MULHI * T0.Y, PV.W, T0.X, 2067; EG-NEXT: ADD_INT T0.Z, T3.W, PV.W, 2068; EG-NEXT: ADD_INT T2.W, T0.X, PS, 2069; EG-NEXT: SUB_INT * T3.W, T0.X, PS, 2070; EG-NEXT: CNDE_INT T2.W, T1.X, PV.W, PS, 2071; EG-NEXT: XOR_INT * T3.W, PV.Z, T4.W, 2072; EG-NEXT: MULHI * T0.X, PV.W, PS, 2073; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, 2074; EG-NEXT: SUB_INT * T2.W, T3.W, PS, 2075; EG-NEXT: SETGE_UINT T0.W, PV.W, T0.W, 2076; EG-NEXT: SETGE_UINT * T2.W, T3.W, T0.Y, 2077; EG-NEXT: AND_INT T0.W, PV.W, PS, 2078; EG-NEXT: ADD_INT * T3.W, T0.X, 1, 2079; EG-NEXT: CNDE_INT T0.W, PV.W, T0.X, PS, 2080; EG-NEXT: ADD_INT * T3.W, T0.X, literal.x, 2081; EG-NEXT: -1(nan), 0(0.000000e+00) 2082; EG-NEXT: CNDE_INT T0.W, T2.W, PS, PV.W, 2083; EG-NEXT: XOR_INT * T1.W, T4.W, T1.W, 2084; EG-NEXT: XOR_INT * T0.W, PV.W, PS, 2085; EG-NEXT: SUB_INT * T0.W, PV.W, T1.W, 2086; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2087; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2088; EG-NEXT: ASHR T0.X, PV.W, literal.x, 2089; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 2090; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) 2091 %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1 2092 %num = load i24, i24 addrspace(1) * %in 2093 %den = load i24, i24 addrspace(1) * %den_ptr 2094 %result = sdiv i24 %num, %den 2095 %result.ext = sext i24 %result to i32 2096 store i32 %result.ext, i32 addrspace(1)* %out 2097 ret void 2098} 2099 ; Signed 25-bit division: loads the numerator from %in and the denominator from the next i25 element, divides them with sdiv, and stores the quotient sign-extended to i32 in %out. The checks that follow expect the longer integer expansion (v_mul_hi_u32 on amdgcn, MULHI on r600) on every target. 2100define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) { 2101; GCN-LABEL: v_sdiv_i25: 2102; GCN: ; %bb.0: 2103; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2104; GCN-NEXT: s_mov_b32 s7, 0xf000 2105; GCN-NEXT: s_mov_b32 s6, -1 2106; GCN-NEXT: s_mov_b32 s10, s6 2107; GCN-NEXT: s_mov_b32 s11, s7 2108; GCN-NEXT: s_waitcnt lgkmcnt(0) 2109; GCN-NEXT: s_mov_b32 s8, s2 2110; GCN-NEXT: s_mov_b32 s9, s3 2111; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 2112; GCN-NEXT: 
s_mov_b32 s4, s0 2113; GCN-NEXT: s_mov_b32 s5, s1 2114; GCN-NEXT: s_waitcnt vmcnt(0) 2115; GCN-NEXT: v_bfe_i32 v2, v0, 0, 25 2116; GCN-NEXT: v_bfe_i32 v3, v1, 0, 25 2117; GCN-NEXT: v_bfe_i32 v0, v0, 24, 1 2118; GCN-NEXT: v_bfe_i32 v1, v1, 24, 1 2119; GCN-NEXT: v_xor_b32_e32 v4, v0, v1 2120; GCN-NEXT: v_add_i32_e32 v2, vcc, v0, v2 2121; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v3 2122; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 2123; GCN-NEXT: v_xor_b32_e32 v1, v3, v1 2124; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 2125; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 2126; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 2127; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 2128; GCN-NEXT: v_mul_hi_u32 v3, v2, v1 2129; GCN-NEXT: v_mul_lo_u32 v5, v2, v1 2130; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v5 2131; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 2132; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] 2133; GCN-NEXT: v_mul_hi_u32 v3, v3, v2 2134; GCN-NEXT: v_add_i32_e32 v5, vcc, v3, v2 2135; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v3, v2 2136; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 2137; GCN-NEXT: v_mul_hi_u32 v2, v2, v0 2138; GCN-NEXT: v_mul_lo_u32 v3, v2, v1 2139; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v2 2140; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v2 2141; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v0 2142; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 2143; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v1 2144; GCN-NEXT: s_and_b64 s[0:1], s[0:1], vcc 2145; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v5, s[0:1] 2146; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc 2147; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 2148; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 2149; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 2150; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 2151; GCN-NEXT: s_endpgm 2152; 2153; TONGA-LABEL: v_sdiv_i25: 2154; TONGA: ; %bb.0: 2155; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 2156; TONGA-NEXT: s_mov_b32 s7, 0xf000 2157; TONGA-NEXT: s_mov_b32 s6, -1 2158; TONGA-NEXT: s_mov_b32 s2, s6 2159; TONGA-NEXT: s_mov_b32 s3, s7 2160; TONGA-NEXT: 
s_waitcnt lgkmcnt(0) 2161; TONGA-NEXT: s_mov_b32 s0, s10 2162; TONGA-NEXT: s_mov_b32 s1, s11 2163; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 2164; TONGA-NEXT: s_mov_b32 s4, s8 2165; TONGA-NEXT: s_mov_b32 s5, s9 2166; TONGA-NEXT: s_waitcnt vmcnt(0) 2167; TONGA-NEXT: v_bfe_i32 v2, v1, 0, 25 2168; TONGA-NEXT: v_bfe_i32 v1, v1, 24, 1 2169; TONGA-NEXT: v_add_u32_e32 v2, vcc, v1, v2 2170; TONGA-NEXT: v_xor_b32_e32 v2, v2, v1 2171; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2 2172; TONGA-NEXT: v_bfe_i32 v4, v0, 0, 25 2173; TONGA-NEXT: v_bfe_i32 v0, v0, 24, 1 2174; TONGA-NEXT: v_add_u32_e32 v4, vcc, v0, v4 2175; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 2176; TONGA-NEXT: v_xor_b32_e32 v4, v4, v0 2177; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 2178; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3 2179; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 2180; TONGA-NEXT: v_mul_lo_u32 v5, v3, v2 2181; TONGA-NEXT: v_mul_hi_u32 v6, v3, v2 2182; TONGA-NEXT: v_sub_u32_e32 v7, vcc, 0, v5 2183; TONGA-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 2184; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[0:1] 2185; TONGA-NEXT: v_mul_hi_u32 v5, v5, v3 2186; TONGA-NEXT: v_add_u32_e32 v6, vcc, v5, v3 2187; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v5, v3 2188; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 2189; TONGA-NEXT: v_mul_hi_u32 v3, v3, v4 2190; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2 2191; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3 2192; TONGA-NEXT: v_add_u32_e32 v6, vcc, -1, v3 2193; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, v1, v4 2194; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 2195; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v2 2196; TONGA-NEXT: s_and_b64 s[0:1], s[0:1], vcc 2197; TONGA-NEXT: v_cndmask_b32_e64 v1, v3, v5, s[0:1] 2198; TONGA-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc 2199; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0 2200; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 2201; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25 2202; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 2203; TONGA-NEXT: s_endpgm 2204; 2205; GFX9-LABEL: 
v_sdiv_i25: 2206; GFX9: ; %bb.0: 2207; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2208; GFX9-NEXT: s_mov_b32 s7, 0xf000 2209; GFX9-NEXT: s_mov_b32 s6, -1 2210; GFX9-NEXT: s_mov_b32 s10, s6 2211; GFX9-NEXT: s_mov_b32 s11, s7 2212; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2213; GFX9-NEXT: s_mov_b32 s8, s2 2214; GFX9-NEXT: s_mov_b32 s9, s3 2215; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 2216; GFX9-NEXT: s_mov_b32 s4, s0 2217; GFX9-NEXT: s_mov_b32 s5, s1 2218; GFX9-NEXT: s_waitcnt vmcnt(0) 2219; GFX9-NEXT: v_bfe_i32 v2, v1, 0, 25 2220; GFX9-NEXT: v_bfe_i32 v1, v1, 24, 1 2221; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 2222; GFX9-NEXT: v_xor_b32_e32 v2, v2, v1 2223; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v2 2224; GFX9-NEXT: v_bfe_i32 v6, v0, 0, 25 2225; GFX9-NEXT: v_bfe_i32 v0, v0, 24, 1 2226; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 2227; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3 2228; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2229; GFX9-NEXT: v_mul_lo_u32 v4, v3, v2 2230; GFX9-NEXT: v_mul_hi_u32 v5, v3, v2 2231; GFX9-NEXT: v_sub_u32_e32 v7, 0, v4 2232; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 2233; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc 2234; GFX9-NEXT: v_mul_hi_u32 v4, v4, v3 2235; GFX9-NEXT: v_add_u32_e32 v5, v6, v0 2236; GFX9-NEXT: v_xor_b32_e32 v5, v5, v0 2237; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 2238; GFX9-NEXT: v_add_u32_e32 v6, v3, v4 2239; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4 2240; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2241; GFX9-NEXT: v_mul_hi_u32 v3, v3, v5 2242; GFX9-NEXT: v_mul_lo_u32 v4, v3, v2 2243; GFX9-NEXT: v_add_u32_e32 v1, 1, v3 2244; GFX9-NEXT: v_add_u32_e32 v6, -1, v3 2245; GFX9-NEXT: v_sub_u32_e32 v7, v5, v4 2246; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 2247; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v2 2248; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc 2249; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] 2250; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc 2251; GFX9-NEXT: v_xor_b32_e32 v1, v1, v0 2252; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 2253; 
GFX9-NEXT: v_bfe_i32 v0, v0, 0, 25 2254; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 2255; GFX9-NEXT: s_endpgm 2256; 2257; EG-LABEL: v_sdiv_i25: 2258; EG: ; %bb.0: 2259; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] 2260; EG-NEXT: TEX 1 @6 2261; EG-NEXT: ALU 41, @12, KC0[CB0:0-32], KC1[] 2262; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 2263; EG-NEXT: CF_END 2264; EG-NEXT: PAD 2265; EG-NEXT: Fetch clause starting at 6: 2266; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1 2267; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 2268; EG-NEXT: ALU clause starting at 10: 2269; EG-NEXT: MOV * T0.X, KC0[2].Z, 2270; EG-NEXT: MOV * T1.X, PV.X, 2271; EG-NEXT: ALU clause starting at 12: 2272; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 2273; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) 2274; EG-NEXT: ASHR * T0.W, PV.W, literal.x, 2275; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) 2276; EG-NEXT: SETGT_INT * T1.W, 0.0, PV.W, 2277; EG-NEXT: ADD_INT * T0.W, T0.W, PV.W, 2278; EG-NEXT: XOR_INT * T0.W, PV.W, T1.W, 2279; EG-NEXT: RECIP_UINT * T0.X, PV.W, 2280; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, 2281; EG-NEXT: LSHL T0.Z, T1.X, literal.x, 2282; EG-NEXT: SUB_INT T2.W, 0.0, PS, 2283; EG-NEXT: MULHI * T1.X, T0.X, T0.W, 2284; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) 2285; EG-NEXT: CNDE_INT T2.W, PS, PV.W, T0.Y, 2286; EG-NEXT: ASHR * T3.W, PV.Z, literal.x, 2287; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) 2288; EG-NEXT: SETGT_INT T4.W, 0.0, PS, 2289; EG-NEXT: MULHI * T0.Y, PV.W, T0.X, 2290; EG-NEXT: ADD_INT T0.Z, T3.W, PV.W, 2291; EG-NEXT: ADD_INT T2.W, T0.X, PS, 2292; EG-NEXT: SUB_INT * T3.W, T0.X, PS, 2293; EG-NEXT: CNDE_INT T2.W, T1.X, PV.W, PS, 2294; EG-NEXT: XOR_INT * T3.W, PV.Z, T4.W, 2295; EG-NEXT: MULHI * T0.X, PV.W, PS, 2296; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, 2297; EG-NEXT: SUB_INT * T2.W, T3.W, PS, 2298; EG-NEXT: SETGE_UINT T0.W, PV.W, T0.W, 2299; EG-NEXT: SETGE_UINT * T2.W, T3.W, T0.Y, 2300; EG-NEXT: AND_INT T0.W, PV.W, PS, 2301; EG-NEXT: ADD_INT * T3.W, T0.X, 1, 2302; EG-NEXT: CNDE_INT T0.W, 
PV.W, T0.X, PS, 2303; EG-NEXT: ADD_INT * T3.W, T0.X, literal.x, 2304; EG-NEXT: -1(nan), 0(0.000000e+00) 2305; EG-NEXT: CNDE_INT T0.W, T2.W, PS, PV.W, 2306; EG-NEXT: XOR_INT * T1.W, T4.W, T1.W, 2307; EG-NEXT: XOR_INT * T0.W, PV.W, PS, 2308; EG-NEXT: SUB_INT * T0.W, PV.W, T1.W, 2309; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2310; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) 2311; EG-NEXT: ASHR T0.X, PV.W, literal.x, 2312; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 2313; EG-NEXT: 7(9.809089e-45), 2(2.802597e-45) 2314 %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1 2315 %num = load i25, i25 addrspace(1) * %in 2316 %den = load i25, i25 addrspace(1) * %den_ptr 2317 %result = sdiv i25 %num, %den 2318 %result.ext = sext i25 %result to i32 2319 store i32 %result.ext, i32 addrspace(1)* %out 2320 ret void 2321} 2322 2323; Tests for 64-bit divide bypass. NOTE: the three kernels below are commented out, so they are currently neither compiled nor checked. 2324; define amdgpu_kernel void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { 2325; %result = sdiv i64 %a, %b 2326; store i64 %result, i64 addrspace(1)* %out, align 8 2327; ret void 2328; } 2329 2330; define amdgpu_kernel void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { 2331; %result = srem i64 %a, %b 2332; store i64 %result, i64 addrspace(1)* %out, align 8 2333; ret void 2334; } 2335 2336; define amdgpu_kernel void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { 2337; %resultdiv = sdiv i64 %a, %b 2338; %resultrem = srem i64 %a, %b 2339; %result = add i64 %resultdiv, %resultrem 2340; store i64 %result, i64 addrspace(1)* %out, align 8 2341; ret void 2342; } 2343 2344define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { 2345; GCN-LABEL: scalarize_mulhs_4xi32: 2346; GCN: ; %bb.0: 2347; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2348; GCN-NEXT: s_mov_b32 s7, 0xf000 2349; GCN-NEXT: s_mov_b32 s6, -1 2350; GCN-NEXT: s_waitcnt lgkmcnt(0) 2351; GCN-NEXT: 
s_mov_b32 s4, s0 2352; GCN-NEXT: s_mov_b32 s5, s1 2353; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 2354; GCN-NEXT: s_mov_b32 s0, 0x1389c755 2355; GCN-NEXT: s_mov_b32 s4, s2 2356; GCN-NEXT: s_mov_b32 s5, s3 2357; GCN-NEXT: s_waitcnt vmcnt(0) 2358; GCN-NEXT: v_mul_hi_i32 v0, v0, s0 2359; GCN-NEXT: v_mul_hi_i32 v1, v1, s0 2360; GCN-NEXT: v_mul_hi_i32 v2, v2, s0 2361; GCN-NEXT: v_mul_hi_i32 v3, v3, s0 2362; GCN-NEXT: v_lshrrev_b32_e32 v4, 31, v0 2363; GCN-NEXT: v_ashrrev_i32_e32 v0, 12, v0 2364; GCN-NEXT: v_lshrrev_b32_e32 v5, 31, v1 2365; GCN-NEXT: v_ashrrev_i32_e32 v1, 12, v1 2366; GCN-NEXT: v_lshrrev_b32_e32 v6, 31, v2 2367; GCN-NEXT: v_ashrrev_i32_e32 v2, 12, v2 2368; GCN-NEXT: v_lshrrev_b32_e32 v7, 31, v3 2369; GCN-NEXT: v_ashrrev_i32_e32 v3, 12, v3 2370; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 2371; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5 2372; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 2373; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 2374; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2375; GCN-NEXT: s_endpgm 2376; 2377; TONGA-LABEL: scalarize_mulhs_4xi32: 2378; TONGA: ; %bb.0: 2379; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2380; TONGA-NEXT: s_mov_b32 s3, 0xf000 2381; TONGA-NEXT: s_mov_b32 s2, -1 2382; TONGA-NEXT: s_waitcnt lgkmcnt(0) 2383; TONGA-NEXT: s_mov_b32 s0, s4 2384; TONGA-NEXT: s_mov_b32 s1, s5 2385; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 2386; TONGA-NEXT: s_mov_b32 s0, 0x1389c755 2387; TONGA-NEXT: s_mov_b32 s4, s6 2388; TONGA-NEXT: s_mov_b32 s5, s7 2389; TONGA-NEXT: s_mov_b32 s6, s2 2390; TONGA-NEXT: s_mov_b32 s7, s3 2391; TONGA-NEXT: s_waitcnt vmcnt(0) 2392; TONGA-NEXT: v_mul_hi_i32 v0, v0, s0 2393; TONGA-NEXT: v_mul_hi_i32 v1, v1, s0 2394; TONGA-NEXT: v_mul_hi_i32 v2, v2, s0 2395; TONGA-NEXT: v_mul_hi_i32 v3, v3, s0 2396; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v0 2397; TONGA-NEXT: v_ashrrev_i32_e32 v0, 12, v0 2398; TONGA-NEXT: v_lshrrev_b32_e32 v5, 31, v1 2399; TONGA-NEXT: v_ashrrev_i32_e32 v1, 12, v1 2400; 
TONGA-NEXT: v_lshrrev_b32_e32 v6, 31, v2 2401; TONGA-NEXT: v_ashrrev_i32_e32 v2, 12, v2 2402; TONGA-NEXT: v_lshrrev_b32_e32 v7, 31, v3 2403; TONGA-NEXT: v_ashrrev_i32_e32 v3, 12, v3 2404; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4 2405; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 2406; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6 2407; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 2408; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2409; TONGA-NEXT: s_endpgm 2410; 2411; GFX9-LABEL: scalarize_mulhs_4xi32: 2412; GFX9: ; %bb.0: 2413; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2414; GFX9-NEXT: s_mov_b32 s3, 0xf000 2415; GFX9-NEXT: s_mov_b32 s2, -1 2416; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2417; GFX9-NEXT: s_mov_b32 s0, s4 2418; GFX9-NEXT: s_mov_b32 s1, s5 2419; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 2420; GFX9-NEXT: s_mov_b32 s0, 0x1389c755 2421; GFX9-NEXT: s_mov_b32 s4, s6 2422; GFX9-NEXT: s_mov_b32 s5, s7 2423; GFX9-NEXT: s_mov_b32 s6, s2 2424; GFX9-NEXT: s_mov_b32 s7, s3 2425; GFX9-NEXT: s_waitcnt vmcnt(0) 2426; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0 2427; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0 2428; GFX9-NEXT: v_mul_hi_i32 v2, v2, s0 2429; GFX9-NEXT: v_mul_hi_i32 v3, v3, s0 2430; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0 2431; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0 2432; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1 2433; GFX9-NEXT: v_ashrrev_i32_e32 v1, 12, v1 2434; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v2 2435; GFX9-NEXT: v_ashrrev_i32_e32 v2, 12, v2 2436; GFX9-NEXT: v_lshrrev_b32_e32 v7, 31, v3 2437; GFX9-NEXT: v_ashrrev_i32_e32 v3, 12, v3 2438; GFX9-NEXT: v_add_u32_e32 v0, v0, v4 2439; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 2440; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 2441; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 2442; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2443; GFX9-NEXT: s_endpgm 2444; 2445; EG-LABEL: scalarize_mulhs_4xi32: 2446; EG: ; %bb.0: 2447; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 2448; EG-NEXT: TEX 0 @6 2449; EG-NEXT: ALU 25, @9, KC0[CB0:0-32], 
KC1[] 2450; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2451; EG-NEXT: CF_END 2452; EG-NEXT: PAD 2453; EG-NEXT: Fetch clause starting at 6: 2454; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 2455; EG-NEXT: ALU clause starting at 8: 2456; EG-NEXT: MOV * T0.X, KC0[2].Y, 2457; EG-NEXT: ALU clause starting at 9: 2458; EG-NEXT: MULHI_INT * T0.W, T0.W, literal.x, 2459; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00) 2460; EG-NEXT: ASHR T1.Z, PS, literal.x, 2461; EG-NEXT: LSHR T0.W, PS, literal.y, 2462; EG-NEXT: MULHI_INT * T0.Z, T0.Z, literal.z, 2463; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44) 2464; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00) 2465; EG-NEXT: ASHR T1.Y, PS, literal.x, 2466; EG-NEXT: LSHR T0.Z, PS, literal.y, 2467; EG-NEXT: ADD_INT T0.W, PV.Z, PV.W, 2468; EG-NEXT: MULHI_INT * T0.Y, T0.Y, literal.z, 2469; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44) 2470; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00) 2471; EG-NEXT: ASHR T2.Y, PS, literal.x, 2472; EG-NEXT: ADD_INT T0.Z, PV.Y, PV.Z, 2473; EG-NEXT: LSHR T1.W, PS, literal.y, 2474; EG-NEXT: MULHI_INT * T0.X, T0.X, literal.z, 2475; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44) 2476; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00) 2477; EG-NEXT: ADD_INT T0.Y, PV.Y, PV.W, 2478; EG-NEXT: ASHR T1.W, PS, literal.x, 2479; EG-NEXT: LSHR * T2.W, PS, literal.y, 2480; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44) 2481; EG-NEXT: ADD_INT T0.X, PV.W, PS, 2482; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.x, 2483; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2484 %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 2485 %2 = sdiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668> 2486 store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16 2487 ret void 2488} 2489