1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx600 | FileCheck %s --check-prefix=GCN 3; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefix=TONGA 4; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global | FileCheck %s --check-prefix=GFX9 5; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG 6 7; The code generated by sdiv is long and complex and may frequently change. 8; The goal of this test is to make sure the ISel doesn't fail. 9; 10; This program was previously failing to compile when one of the selectcc 11; opcodes generated by the sdiv lowering was being legalized and optimized to: 12; selectcc Remainder -1, 0, -1, SETGT 13; This was fixed by adding an additional pattern in R600Instructions.td to 14; match this pattern with a CNDGE_INT. 
15 16define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { 17; GCN-LABEL: sdiv_i32: 18; GCN: ; %bb.0: 19; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 20; GCN-NEXT: s_mov_b32 s7, 0xf000 21; GCN-NEXT: s_mov_b32 s6, -1 22; GCN-NEXT: s_mov_b32 s10, s6 23; GCN-NEXT: s_mov_b32 s11, s7 24; GCN-NEXT: s_waitcnt lgkmcnt(0) 25; GCN-NEXT: s_mov_b32 s8, s2 26; GCN-NEXT: s_mov_b32 s9, s3 27; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 28; GCN-NEXT: s_mov_b32 s4, s0 29; GCN-NEXT: s_mov_b32 s5, s1 30; GCN-NEXT: s_waitcnt vmcnt(0) 31; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 32; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 33; GCN-NEXT: v_xor_b32_e32 v1, v1, v2 34; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1 35; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 36; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v0 37; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 38; GCN-NEXT: v_add_i32_e32 v0, vcc, v5, v0 39; GCN-NEXT: v_xor_b32_e32 v0, v0, v5 40; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 41; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 42; GCN-NEXT: v_xor_b32_e32 v2, v5, v2 43; GCN-NEXT: v_mul_lo_u32 v4, v4, v3 44; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 45; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 46; GCN-NEXT: v_mul_hi_u32 v3, v0, v3 47; GCN-NEXT: v_mul_lo_u32 v4, v3, v1 48; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 49; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v4, v0 50; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v1 51; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 52; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v1, v0 53; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 54; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 55; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 56; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 57; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 58; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 59; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 60; GCN-NEXT: s_endpgm 61; 62; TONGA-LABEL: sdiv_i32: 63; TONGA: ; %bb.0: 64; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 65; TONGA-NEXT: s_mov_b32 s7, 0xf000 66; TONGA-NEXT: 
s_mov_b32 s6, -1 67; TONGA-NEXT: s_mov_b32 s10, s6 68; TONGA-NEXT: s_mov_b32 s11, s7 69; TONGA-NEXT: s_waitcnt lgkmcnt(0) 70; TONGA-NEXT: s_mov_b32 s8, s2 71; TONGA-NEXT: s_mov_b32 s9, s3 72; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 73; TONGA-NEXT: s_mov_b32 s4, s0 74; TONGA-NEXT: s_mov_b32 s5, s1 75; TONGA-NEXT: s_waitcnt vmcnt(0) 76; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v1 77; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v1 78; TONGA-NEXT: v_xor_b32_e32 v1, v1, v2 79; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v1 80; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v1 81; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 82; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 83; TONGA-NEXT: v_add_u32_e32 v0, vcc, v5, v0 84; TONGA-NEXT: v_xor_b32_e32 v0, v0, v5 85; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 86; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 87; TONGA-NEXT: v_xor_b32_e32 v2, v5, v2 88; TONGA-NEXT: v_mul_lo_u32 v4, v4, v3 89; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 90; TONGA-NEXT: v_add_u32_e32 v3, vcc, v4, v3 91; TONGA-NEXT: v_mul_hi_u32 v3, v0, v3 92; TONGA-NEXT: v_mul_lo_u32 v4, v3, v1 93; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3 94; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v4, v0 95; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v1 96; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 97; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v1, v0 98; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 99; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 100; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 101; TONGA-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 102; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2 103; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 104; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 105; TONGA-NEXT: s_endpgm 106; 107; GFX9-LABEL: sdiv_i32: 108; GFX9: ; %bb.0: 109; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 110; GFX9-NEXT: s_mov_b32 s7, 0xf000 111; GFX9-NEXT: s_mov_b32 s6, -1 112; GFX9-NEXT: s_mov_b32 s10, s6 113; GFX9-NEXT: s_mov_b32 s11, s7 114; GFX9-NEXT: s_waitcnt lgkmcnt(0) 115; GFX9-NEXT: s_mov_b32 s8, s2 
116; GFX9-NEXT: s_mov_b32 s9, s3 117; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 118; GFX9-NEXT: s_mov_b32 s4, s0 119; GFX9-NEXT: s_mov_b32 s5, s1 120; GFX9-NEXT: s_waitcnt vmcnt(0) 121; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1 122; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 123; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2 124; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v1 125; GFX9-NEXT: v_sub_u32_e32 v4, 0, v1 126; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v0 127; GFX9-NEXT: v_add_u32_e32 v0, v0, v5 128; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 129; GFX9-NEXT: v_xor_b32_e32 v0, v0, v5 130; GFX9-NEXT: v_xor_b32_e32 v2, v5, v2 131; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 132; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 133; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3 134; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 135; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 136; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 137; GFX9-NEXT: v_mul_lo_u32 v4, v3, v1 138; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 139; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4 140; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 141; GFX9-NEXT: v_sub_u32_e32 v4, v0, v1 142; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 143; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 144; GFX9-NEXT: v_add_u32_e32 v4, 1, v3 145; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 146; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 147; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2 148; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 149; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 150; GFX9-NEXT: s_endpgm 151; 152; EG-LABEL: sdiv_i32: 153; EG: ; %bb.0: 154; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 155; EG-NEXT: TEX 0 @6 156; EG-NEXT: ALU 26, @9, KC0[CB0:0-32], KC1[] 157; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 158; EG-NEXT: CF_END 159; EG-NEXT: PAD 160; EG-NEXT: Fetch clause starting at 6: 161; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 162; EG-NEXT: ALU clause starting at 8: 163; EG-NEXT: MOV * T0.X, KC0[2].Z, 164; EG-NEXT: ALU clause starting at 9: 165; EG-NEXT: SETGT_INT * T0.W, 0.0, T0.Y, 166; EG-NEXT: ADD_INT * 
T1.W, T0.Y, PV.W, 167; EG-NEXT: XOR_INT * T1.W, PV.W, T0.W, 168; EG-NEXT: SUB_INT T2.W, 0.0, PV.W, 169; EG-NEXT: RECIP_UINT * T0.Y, PV.W, 170; EG-NEXT: SETGT_INT T3.W, 0.0, T0.X, 171; EG-NEXT: MULLO_INT * T0.Z, PV.W, PS, 172; EG-NEXT: ADD_INT T2.W, T0.X, PV.W, 173; EG-NEXT: MULHI * T0.X, T0.Y, PS, 174; EG-NEXT: ADD_INT T4.W, T0.Y, PS, 175; EG-NEXT: XOR_INT * T2.W, PV.W, T3.W, 176; EG-NEXT: MULHI * T0.X, PS, PV.W, 177; EG-NEXT: MULLO_INT * T0.Y, PS, T1.W, 178; EG-NEXT: SUB_INT * T2.W, T2.W, PS, 179; EG-NEXT: ADD_INT T0.Z, T0.X, 1, 180; EG-NEXT: SETGE_UINT T4.W, PV.W, T1.W, 181; EG-NEXT: SUB_INT * T5.W, PV.W, T1.W, 182; EG-NEXT: CNDE_INT T2.W, PV.W, T2.W, PS, 183; EG-NEXT: CNDE_INT * T4.W, PV.W, T0.X, PV.Z, 184; EG-NEXT: ADD_INT T5.W, PS, 1, 185; EG-NEXT: SETGE_UINT * T1.W, PV.W, T1.W, 186; EG-NEXT: CNDE_INT T1.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221 187; EG-NEXT: XOR_INT * T0.W, T3.W, T0.W, 188; EG-NEXT: XOR_INT * T1.W, PV.W, PS, 189; EG-NEXT: SUB_INT T0.X, PV.W, T0.W, 190; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 191; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 192 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 193 %num = load i32, i32 addrspace(1) * %in 194 %den = load i32, i32 addrspace(1) * %den_ptr 195 %result = sdiv i32 %num, %den 196 store i32 %result, i32 addrspace(1)* %out 197 ret void 198} 199 200define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { 201; GCN-LABEL: sdiv_i32_4: 202; GCN: ; %bb.0: 203; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 204; GCN-NEXT: s_mov_b32 s7, 0xf000 205; GCN-NEXT: s_mov_b32 s6, -1 206; GCN-NEXT: s_mov_b32 s10, s6 207; GCN-NEXT: s_mov_b32 s11, s7 208; GCN-NEXT: s_waitcnt lgkmcnt(0) 209; GCN-NEXT: s_mov_b32 s8, s2 210; GCN-NEXT: s_mov_b32 s9, s3 211; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 212; GCN-NEXT: s_mov_b32 s4, s0 213; GCN-NEXT: s_mov_b32 s5, s1 214; GCN-NEXT: s_waitcnt vmcnt(0) 215; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 216; GCN-NEXT: v_lshrrev_b32_e32 v1, 30, v1 217; 
GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 218; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 219; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 220; GCN-NEXT: s_endpgm 221; 222; TONGA-LABEL: sdiv_i32_4: 223; TONGA: ; %bb.0: 224; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 225; TONGA-NEXT: s_mov_b32 s7, 0xf000 226; TONGA-NEXT: s_mov_b32 s6, -1 227; TONGA-NEXT: s_mov_b32 s10, s6 228; TONGA-NEXT: s_mov_b32 s11, s7 229; TONGA-NEXT: s_waitcnt lgkmcnt(0) 230; TONGA-NEXT: s_mov_b32 s8, s2 231; TONGA-NEXT: s_mov_b32 s9, s3 232; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0 233; TONGA-NEXT: s_mov_b32 s4, s0 234; TONGA-NEXT: s_mov_b32 s5, s1 235; TONGA-NEXT: s_waitcnt vmcnt(0) 236; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 237; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1 238; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 239; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 240; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 241; TONGA-NEXT: s_endpgm 242; 243; GFX9-LABEL: sdiv_i32_4: 244; GFX9: ; %bb.0: 245; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 246; GFX9-NEXT: s_mov_b32 s7, 0xf000 247; GFX9-NEXT: s_mov_b32 s6, -1 248; GFX9-NEXT: s_mov_b32 s10, s6 249; GFX9-NEXT: s_mov_b32 s11, s7 250; GFX9-NEXT: s_waitcnt lgkmcnt(0) 251; GFX9-NEXT: s_mov_b32 s8, s2 252; GFX9-NEXT: s_mov_b32 s9, s3 253; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 254; GFX9-NEXT: s_mov_b32 s4, s0 255; GFX9-NEXT: s_mov_b32 s5, s1 256; GFX9-NEXT: s_waitcnt vmcnt(0) 257; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 258; GFX9-NEXT: v_lshrrev_b32_e32 v1, 30, v1 259; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 260; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 261; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 262; GFX9-NEXT: s_endpgm 263; 264; EG-LABEL: sdiv_i32_4: 265; EG: ; %bb.0: 266; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 267; EG-NEXT: TEX 0 @6 268; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[] 269; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 270; EG-NEXT: CF_END 271; EG-NEXT: PAD 272; EG-NEXT: Fetch clause starting at 6: 
273; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 274; EG-NEXT: ALU clause starting at 8: 275; EG-NEXT: MOV * T0.X, KC0[2].Z, 276; EG-NEXT: ALU clause starting at 9: 277; EG-NEXT: ASHR * T0.W, T0.X, literal.x, 278; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 279; EG-NEXT: LSHR * T0.W, PV.W, literal.x, 280; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) 281; EG-NEXT: ADD_INT * T0.W, T0.X, PV.W, 282; EG-NEXT: ASHR T0.X, PV.W, literal.x, 283; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 284; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 285 %num = load i32, i32 addrspace(1) * %in 286 %result = sdiv i32 %num, 4 287 store i32 %result, i32 addrspace(1)* %out 288 ret void 289} 290 291; Divide by a weird constant to make sure setIntDivIsCheap is 292; working (the checks expect a mul-hi by a magic constant). 293 294define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { 295; GCN-LABEL: slow_sdiv_i32_3435: 296; GCN: ; %bb.0: 297; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 298; GCN-NEXT: s_mov_b32 s7, 0xf000 299; GCN-NEXT: s_mov_b32 s6, -1 300; GCN-NEXT: s_mov_b32 s10, s6 301; GCN-NEXT: s_mov_b32 s11, s7 302; GCN-NEXT: s_waitcnt lgkmcnt(0) 303; GCN-NEXT: s_mov_b32 s8, s2 304; GCN-NEXT: s_mov_b32 s9, s3 305; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 306; GCN-NEXT: s_mov_b32 s2, 0x98a1930b 307; GCN-NEXT: s_mov_b32 s4, s0 308; GCN-NEXT: s_mov_b32 s5, s1 309; GCN-NEXT: s_waitcnt vmcnt(0) 310; GCN-NEXT: v_mul_hi_i32 v1, v0, s2 311; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 312; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 313; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0 314; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 315; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 316; GCN-NEXT: s_endpgm 317; 318; TONGA-LABEL: slow_sdiv_i32_3435: 319; TONGA: ; %bb.0: 320; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 321; TONGA-NEXT: s_mov_b32 s7, 0xf000 322; TONGA-NEXT: s_mov_b32 s6, -1 323; TONGA-NEXT: s_mov_b32 s10, s6 324; TONGA-NEXT: s_mov_b32 s11, s7 325; TONGA-NEXT: s_waitcnt lgkmcnt(0) 326; TONGA-NEXT: s_mov_b32 
s8, s2 327; TONGA-NEXT: s_mov_b32 s9, s3 328; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0 329; TONGA-NEXT: s_mov_b32 s2, 0x98a1930b 330; TONGA-NEXT: s_mov_b32 s4, s0 331; TONGA-NEXT: s_mov_b32 s5, s1 332; TONGA-NEXT: s_waitcnt vmcnt(0) 333; TONGA-NEXT: v_mul_hi_i32 v1, v0, s2 334; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 335; TONGA-NEXT: v_lshrrev_b32_e32 v1, 31, v0 336; TONGA-NEXT: v_ashrrev_i32_e32 v0, 11, v0 337; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 338; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 339; TONGA-NEXT: s_endpgm 340; 341; GFX9-LABEL: slow_sdiv_i32_3435: 342; GFX9: ; %bb.0: 343; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 344; GFX9-NEXT: s_mov_b32 s7, 0xf000 345; GFX9-NEXT: s_mov_b32 s6, -1 346; GFX9-NEXT: s_mov_b32 s10, s6 347; GFX9-NEXT: s_mov_b32 s11, s7 348; GFX9-NEXT: s_waitcnt lgkmcnt(0) 349; GFX9-NEXT: s_mov_b32 s8, s2 350; GFX9-NEXT: s_mov_b32 s9, s3 351; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 352; GFX9-NEXT: s_mov_b32 s2, 0x98a1930b 353; GFX9-NEXT: s_mov_b32 s4, s0 354; GFX9-NEXT: s_mov_b32 s5, s1 355; GFX9-NEXT: s_waitcnt vmcnt(0) 356; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2 357; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 358; GFX9-NEXT: v_lshrrev_b32_e32 v1, 31, v0 359; GFX9-NEXT: v_ashrrev_i32_e32 v0, 11, v0 360; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 361; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 362; GFX9-NEXT: s_endpgm 363; 364; EG-LABEL: slow_sdiv_i32_3435: 365; EG: ; %bb.0: 366; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 367; EG-NEXT: TEX 0 @6 368; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[] 369; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 370; EG-NEXT: CF_END 371; EG-NEXT: PAD 372; EG-NEXT: Fetch clause starting at 6: 373; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 374; EG-NEXT: ALU clause starting at 8: 375; EG-NEXT: MOV * T0.X, KC0[2].Z, 376; EG-NEXT: ALU clause starting at 9: 377; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x, 378; EG-NEXT: -1734241525(-4.176600e-24), 0(0.000000e+00) 379; EG-NEXT: ADD_INT * T0.W, 
PS, T0.X, 380; EG-NEXT: ASHR T1.W, PV.W, literal.x, 381; EG-NEXT: LSHR * T0.W, PV.W, literal.y, 382; EG-NEXT: 11(1.541428e-44), 31(4.344025e-44) 383; EG-NEXT: ADD_INT T0.X, PV.W, PS, 384; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 385; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 386 %num = load i32, i32 addrspace(1) * %in 387 %result = sdiv i32 %num, 3435 388 store i32 %result, i32 addrspace(1)* %out 389 ret void 390} 391 392define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { 393; GCN-LABEL: sdiv_v2i32: 394; GCN: ; %bb.0: 395; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 396; GCN-NEXT: s_mov_b32 s7, 0xf000 397; GCN-NEXT: s_mov_b32 s6, -1 398; GCN-NEXT: s_mov_b32 s10, s6 399; GCN-NEXT: s_mov_b32 s11, s7 400; GCN-NEXT: s_waitcnt lgkmcnt(0) 401; GCN-NEXT: s_mov_b32 s8, s2 402; GCN-NEXT: s_mov_b32 s9, s3 403; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 404; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe 405; GCN-NEXT: s_mov_b32 s4, s0 406; GCN-NEXT: s_mov_b32 s5, s1 407; GCN-NEXT: s_waitcnt vmcnt(0) 408; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v2 409; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3 410; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 411; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3 412; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 413; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1 414; GCN-NEXT: v_xor_b32_e32 v2, v2, v5 415; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 416; GCN-NEXT: v_xor_b32_e32 v8, v4, v5 417; GCN-NEXT: v_xor_b32_e32 v9, v6, v7 418; GCN-NEXT: v_cvt_f32_u32_e32 v5, v2 419; GCN-NEXT: v_cvt_f32_u32_e32 v7, v3 420; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 421; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5 422; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v7 423; GCN-NEXT: v_sub_i32_e32 v11, vcc, 0, v3 424; GCN-NEXT: v_mul_f32_e32 v5, s2, v5 425; GCN-NEXT: v_mul_f32_e32 v7, s2, v7 426; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 427; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7 428; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 429; GCN-NEXT: v_mul_lo_u32 v10, v10, v5 430; GCN-NEXT: 
v_mul_lo_u32 v11, v11, v7 431; GCN-NEXT: v_add_i32_e32 v1, vcc, v6, v1 432; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 433; GCN-NEXT: v_mul_hi_u32 v4, v5, v10 434; GCN-NEXT: v_xor_b32_e32 v1, v1, v6 435; GCN-NEXT: v_mul_hi_u32 v6, v7, v11 436; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 437; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v7 438; GCN-NEXT: v_mul_hi_u32 v4, v0, v4 439; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 440; GCN-NEXT: v_mul_lo_u32 v6, v4, v2 441; GCN-NEXT: v_mul_lo_u32 v10, v5, v3 442; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 443; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v6, v0 444; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v10, v1 445; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5 446; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 447; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 448; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] 449; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v2, v0 450; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] 451; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v1 452; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] 453; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 454; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] 455; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v5 456; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 457; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc 458; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 459; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc 460; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 461; GCN-NEXT: v_xor_b32_e32 v1, v1, v9 462; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 463; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 464; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 465; GCN-NEXT: s_endpgm 466; 467; TONGA-LABEL: sdiv_v2i32: 468; TONGA: ; %bb.0: 469; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 470; TONGA-NEXT: s_mov_b32 s7, 0xf000 471; TONGA-NEXT: s_mov_b32 s6, -1 472; TONGA-NEXT: s_mov_b32 s10, s6 473; TONGA-NEXT: s_mov_b32 s11, s7 474; TONGA-NEXT: s_waitcnt lgkmcnt(0) 475; TONGA-NEXT: s_mov_b32 s8, s2 476; TONGA-NEXT: s_mov_b32 s9, s3 477; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, 
s[8:11], 0 478; TONGA-NEXT: s_mov_b32 s2, 0x4f7ffffe 479; TONGA-NEXT: s_mov_b32 s4, s0 480; TONGA-NEXT: s_mov_b32 s5, s1 481; TONGA-NEXT: s_waitcnt vmcnt(0) 482; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 483; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 484; TONGA-NEXT: v_add_u32_e32 v2, vcc, v5, v2 485; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3 486; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 487; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v1 488; TONGA-NEXT: v_xor_b32_e32 v2, v2, v5 489; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 490; TONGA-NEXT: v_xor_b32_e32 v8, v4, v5 491; TONGA-NEXT: v_xor_b32_e32 v9, v6, v7 492; TONGA-NEXT: v_cvt_f32_u32_e32 v5, v2 493; TONGA-NEXT: v_cvt_f32_u32_e32 v7, v3 494; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v2 495; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v5 496; TONGA-NEXT: v_rcp_iflag_f32_e32 v7, v7 497; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v3 498; TONGA-NEXT: v_mul_f32_e32 v5, s2, v5 499; TONGA-NEXT: v_mul_f32_e32 v7, s2, v7 500; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5 501; TONGA-NEXT: v_cvt_u32_f32_e32 v7, v7 502; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 503; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5 504; TONGA-NEXT: v_mul_lo_u32 v11, v11, v7 505; TONGA-NEXT: v_add_u32_e32 v1, vcc, v6, v1 506; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4 507; TONGA-NEXT: v_mul_hi_u32 v4, v5, v10 508; TONGA-NEXT: v_xor_b32_e32 v1, v1, v6 509; TONGA-NEXT: v_mul_hi_u32 v6, v7, v11 510; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v5 511; TONGA-NEXT: v_add_u32_e32 v5, vcc, v6, v7 512; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4 513; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 514; TONGA-NEXT: v_mul_lo_u32 v6, v4, v2 515; TONGA-NEXT: v_mul_lo_u32 v10, v5, v3 516; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 517; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v6, v0 518; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v10, v1 519; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5 520; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 521; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 522; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] 523; 
TONGA-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0 524; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] 525; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1 526; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] 527; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 528; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] 529; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v5 530; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 531; TONGA-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc 532; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 533; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc 534; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 535; TONGA-NEXT: v_xor_b32_e32 v1, v1, v9 536; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 537; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v9 538; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 539; TONGA-NEXT: s_endpgm 540; 541; GFX9-LABEL: sdiv_v2i32: 542; GFX9: ; %bb.0: 543; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 544; GFX9-NEXT: s_mov_b32 s7, 0xf000 545; GFX9-NEXT: s_mov_b32 s6, -1 546; GFX9-NEXT: s_mov_b32 s10, s6 547; GFX9-NEXT: s_mov_b32 s11, s7 548; GFX9-NEXT: s_waitcnt lgkmcnt(0) 549; GFX9-NEXT: s_mov_b32 s8, s2 550; GFX9-NEXT: s_mov_b32 s9, s3 551; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 552; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe 553; GFX9-NEXT: s_mov_b32 s4, s0 554; GFX9-NEXT: s_mov_b32 s5, s1 555; GFX9-NEXT: s_waitcnt vmcnt(0) 556; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v2 557; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v3 558; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 559; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 560; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4 561; GFX9-NEXT: v_xor_b32_e32 v3, v3, v5 562; GFX9-NEXT: v_cvt_f32_u32_e32 v6, v2 563; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v3 564; GFX9-NEXT: v_sub_u32_e32 v10, 0, v2 565; GFX9-NEXT: v_sub_u32_e32 v11, 0, v3 566; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6 567; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 568; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0 569; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v1 570; GFX9-NEXT: v_mul_f32_e32 v6, s2, v6 571; 
GFX9-NEXT: v_mul_f32_e32 v7, s2, v7 572; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 573; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 574; GFX9-NEXT: v_add_u32_e32 v0, v0, v8 575; GFX9-NEXT: v_add_u32_e32 v1, v1, v9 576; GFX9-NEXT: v_mul_lo_u32 v10, v10, v6 577; GFX9-NEXT: v_mul_lo_u32 v11, v11, v7 578; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 579; GFX9-NEXT: v_xor_b32_e32 v1, v1, v9 580; GFX9-NEXT: v_mul_hi_u32 v10, v6, v10 581; GFX9-NEXT: v_mul_hi_u32 v11, v7, v11 582; GFX9-NEXT: v_xor_b32_e32 v4, v8, v4 583; GFX9-NEXT: v_xor_b32_e32 v5, v9, v5 584; GFX9-NEXT: v_add_u32_e32 v6, v6, v10 585; GFX9-NEXT: v_add_u32_e32 v7, v7, v11 586; GFX9-NEXT: v_mul_hi_u32 v6, v0, v6 587; GFX9-NEXT: v_mul_hi_u32 v7, v1, v7 588; GFX9-NEXT: v_mul_lo_u32 v8, v6, v2 589; GFX9-NEXT: v_mul_lo_u32 v9, v7, v3 590; GFX9-NEXT: v_add_u32_e32 v10, 1, v6 591; GFX9-NEXT: v_add_u32_e32 v11, 1, v7 592; GFX9-NEXT: v_sub_u32_e32 v0, v0, v8 593; GFX9-NEXT: v_sub_u32_e32 v1, v1, v9 594; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 595; GFX9-NEXT: v_sub_u32_e32 v8, v0, v2 596; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc 597; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v3 598; GFX9-NEXT: v_sub_u32_e32 v9, v1, v3 599; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc 600; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[0:1] 601; GFX9-NEXT: v_add_u32_e32 v8, 1, v6 602; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[0:1] 603; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 604; GFX9-NEXT: v_add_u32_e32 v9, 1, v7 605; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc 606; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 607; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc 608; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4 609; GFX9-NEXT: v_xor_b32_e32 v1, v1, v5 610; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4 611; GFX9-NEXT: v_sub_u32_e32 v1, v1, v5 612; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 613; GFX9-NEXT: s_endpgm 614; 615; EG-LABEL: sdiv_v2i32: 616; EG: ; %bb.0: 617; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 618; EG-NEXT: TEX 1 @6 619; EG-NEXT: ALU 51, @11, 
KC0[CB0:0-32], KC1[] 620; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 621; EG-NEXT: CF_END 622; EG-NEXT: PAD 623; EG-NEXT: Fetch clause starting at 6: 624; EG-NEXT: VTX_READ_64 T1.XY, T0.X, 8, #1 625; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 626; EG-NEXT: ALU clause starting at 10: 627; EG-NEXT: MOV * T0.X, KC0[2].Z, 628; EG-NEXT: ALU clause starting at 11: 629; EG-NEXT: SETGT_INT * T0.W, 0.0, T1.Y, 630; EG-NEXT: ADD_INT T1.W, T1.Y, PV.W, 631; EG-NEXT: SETGT_INT * T2.W, 0.0, T1.X, 632; EG-NEXT: XOR_INT * T1.W, PV.W, T0.W, 633; EG-NEXT: SUB_INT T0.Z, 0.0, PV.W, 634; EG-NEXT: ADD_INT T3.W, T1.X, T2.W, 635; EG-NEXT: RECIP_UINT * T1.X, PV.W, 636; EG-NEXT: XOR_INT T3.W, PV.W, T2.W, 637; EG-NEXT: MULLO_INT * T0.Z, PV.Z, PS, 638; EG-NEXT: SUB_INT T4.W, 0.0, PV.W, 639; EG-NEXT: RECIP_UINT * T1.Y, PV.W, 640; EG-NEXT: SETGT_INT T5.W, 0.0, T0.X, 641; EG-NEXT: MULLO_INT * T1.Z, PV.W, PS, 642; EG-NEXT: SETGT_INT T2.Z, 0.0, T0.Y, 643; EG-NEXT: ADD_INT T4.W, T0.X, PV.W, 644; EG-NEXT: MULHI * T0.X, T1.Y, PS, 645; EG-NEXT: ADD_INT T1.Y, T1.Y, PS, 646; EG-NEXT: XOR_INT T1.Z, PV.W, T5.W, 647; EG-NEXT: ADD_INT T4.W, T0.Y, PV.Z, BS:VEC_120/SCL_212 648; EG-NEXT: MULHI * T0.X, T1.X, T0.Z, 649; EG-NEXT: ADD_INT T0.Z, T1.X, PS, 650; EG-NEXT: XOR_INT T4.W, PV.W, T2.Z, 651; EG-NEXT: MULHI * T0.X, PV.Z, PV.Y, 652; EG-NEXT: MULHI * T0.Y, PV.W, PV.Z, 653; EG-NEXT: MULLO_INT * T0.Z, PS, T1.W, 654; EG-NEXT: SUB_INT T4.W, T4.W, PS, 655; EG-NEXT: MULLO_INT * T0.Z, T0.X, T3.W, 656; EG-NEXT: SUB_INT T1.Y, T1.Z, PS, 657; EG-NEXT: ADD_INT T0.Z, T0.Y, 1, 658; EG-NEXT: SETGE_UINT T6.W, PV.W, T1.W, 659; EG-NEXT: SUB_INT * T7.W, PV.W, T1.W, 660; EG-NEXT: CNDE_INT T1.X, PV.W, T4.W, PS, BS:VEC_021/SCL_122 661; EG-NEXT: CNDE_INT T0.Y, PV.W, T0.Y, PV.Z, 662; EG-NEXT: ADD_INT T0.Z, T0.X, 1, 663; EG-NEXT: SETGE_UINT T4.W, PV.Y, T3.W, 664; EG-NEXT: SUB_INT * T6.W, PV.Y, T3.W, 665; EG-NEXT: CNDE_INT T1.Y, PV.W, T1.Y, PS, 666; EG-NEXT: CNDE_INT T0.Z, PV.W, T0.X, PV.Z, 667; EG-NEXT: ADD_INT T4.W, PV.Y, 1, 
668; EG-NEXT: SETGE_UINT * T1.W, PV.X, T1.W, 669; EG-NEXT: CNDE_INT T0.Y, PS, T0.Y, PV.W, 670; EG-NEXT: XOR_INT T1.Z, T2.Z, T0.W, BS:VEC_021/SCL_122 671; EG-NEXT: ADD_INT T0.W, PV.Z, 1, 672; EG-NEXT: SETGE_UINT * T1.W, PV.Y, T3.W, 673; EG-NEXT: CNDE_INT T0.Z, PS, T0.Z, PV.W, 674; EG-NEXT: XOR_INT T0.W, T5.W, T2.W, 675; EG-NEXT: XOR_INT * T1.W, PV.Y, PV.Z, 676; EG-NEXT: SUB_INT T0.Y, PS, T1.Z, 677; EG-NEXT: XOR_INT * T1.W, PV.Z, PV.W, 678; EG-NEXT: SUB_INT T0.X, PV.W, T0.W, 679; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 680; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 681 %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 682 %num = load <2 x i32>, <2 x i32> addrspace(1) * %in 683 %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr 684 %result = sdiv <2 x i32> %num, %den 685 store <2 x i32> %result, <2 x i32> addrspace(1)* %out 686 ret void 687} 688 689define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { 690; GCN-LABEL: sdiv_v2i32_4: 691; GCN: ; %bb.0: 692; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 693; GCN-NEXT: s_mov_b32 s7, 0xf000 694; GCN-NEXT: s_mov_b32 s6, -1 695; GCN-NEXT: s_mov_b32 s10, s6 696; GCN-NEXT: s_mov_b32 s11, s7 697; GCN-NEXT: s_waitcnt lgkmcnt(0) 698; GCN-NEXT: s_mov_b32 s8, s2 699; GCN-NEXT: s_mov_b32 s9, s3 700; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 701; GCN-NEXT: s_mov_b32 s4, s0 702; GCN-NEXT: s_mov_b32 s5, s1 703; GCN-NEXT: s_waitcnt vmcnt(0) 704; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0 705; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1 706; GCN-NEXT: v_lshrrev_b32_e32 v2, 30, v2 707; GCN-NEXT: v_lshrrev_b32_e32 v3, 30, v3 708; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 709; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 710; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 711; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1 712; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 713; GCN-NEXT: s_endpgm 714; 715; TONGA-LABEL: sdiv_v2i32_4: 716; TONGA: ; %bb.0: 717; TONGA-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 718; TONGA-NEXT: s_mov_b32 s7, 0xf000 719; TONGA-NEXT: s_mov_b32 s6, -1 720; TONGA-NEXT: s_mov_b32 s10, s6 721; TONGA-NEXT: s_mov_b32 s11, s7 722; TONGA-NEXT: s_waitcnt lgkmcnt(0) 723; TONGA-NEXT: s_mov_b32 s8, s2 724; TONGA-NEXT: s_mov_b32 s9, s3 725; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 726; TONGA-NEXT: s_mov_b32 s4, s0 727; TONGA-NEXT: s_mov_b32 s5, s1 728; TONGA-NEXT: s_waitcnt vmcnt(0) 729; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v0 730; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1 731; TONGA-NEXT: v_lshrrev_b32_e32 v2, 30, v2 732; TONGA-NEXT: v_lshrrev_b32_e32 v3, 30, v3 733; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0 734; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1 735; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 736; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1 737; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 738; TONGA-NEXT: s_endpgm 739; 740; GFX9-LABEL: sdiv_v2i32_4: 741; GFX9: ; %bb.0: 742; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 743; GFX9-NEXT: s_mov_b32 s7, 0xf000 744; GFX9-NEXT: s_mov_b32 s6, -1 745; GFX9-NEXT: s_mov_b32 s10, s6 746; GFX9-NEXT: s_mov_b32 s11, s7 747; GFX9-NEXT: s_waitcnt lgkmcnt(0) 748; GFX9-NEXT: s_mov_b32 s8, s2 749; GFX9-NEXT: s_mov_b32 s9, s3 750; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 751; GFX9-NEXT: s_mov_b32 s4, s0 752; GFX9-NEXT: s_mov_b32 s5, s1 753; GFX9-NEXT: s_waitcnt vmcnt(0) 754; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0 755; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1 756; GFX9-NEXT: v_lshrrev_b32_e32 v2, 30, v2 757; GFX9-NEXT: v_lshrrev_b32_e32 v3, 30, v3 758; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 759; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 760; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 761; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1 762; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 763; GFX9-NEXT: s_endpgm 764; 765; EG-LABEL: sdiv_v2i32_4: 766; EG: ; %bb.0: 767; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 768; EG-NEXT: TEX 0 @6 769; EG-NEXT: ALU 13, @9, 
KC0[CB0:0-32], KC1[] 770; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 771; EG-NEXT: CF_END 772; EG-NEXT: PAD 773; EG-NEXT: Fetch clause starting at 6: 774; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 775; EG-NEXT: ALU clause starting at 8: 776; EG-NEXT: MOV * T0.X, KC0[2].Z, 777; EG-NEXT: ALU clause starting at 9: 778; EG-NEXT: ASHR * T0.W, T0.Y, literal.x, 779; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 780; EG-NEXT: LSHR T0.W, PV.W, literal.x, 781; EG-NEXT: ASHR * T1.W, T0.X, literal.y, 782; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44) 783; EG-NEXT: LSHR T1.W, PS, literal.x, 784; EG-NEXT: ADD_INT * T0.W, T0.Y, PV.W, 785; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) 786; EG-NEXT: ASHR T0.Y, PS, literal.x, 787; EG-NEXT: ADD_INT * T0.W, T0.X, PV.W, 788; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 789; EG-NEXT: ASHR T0.X, PV.W, literal.x, 790; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 791; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 792 %num = load <2 x i32>, <2 x i32> addrspace(1) * %in 793 %result = sdiv <2 x i32> %num, <i32 4, i32 4> 794 store <2 x i32> %result, <2 x i32> addrspace(1)* %out 795 ret void 796} 797 798define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { 799; GCN-LABEL: sdiv_v4i32: 800; GCN: ; %bb.0: 801; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 802; GCN-NEXT: s_mov_b32 s11, 0xf000 803; GCN-NEXT: s_mov_b32 s10, -1 804; GCN-NEXT: s_mov_b32 s6, s10 805; GCN-NEXT: s_mov_b32 s7, s11 806; GCN-NEXT: s_waitcnt lgkmcnt(0) 807; GCN-NEXT: s_mov_b32 s4, s2 808; GCN-NEXT: s_mov_b32 s5, s3 809; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 810; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 811; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe 812; GCN-NEXT: s_mov_b32 s8, s0 813; GCN-NEXT: s_mov_b32 s9, s1 814; GCN-NEXT: s_waitcnt vmcnt(1) 815; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v0 816; GCN-NEXT: s_waitcnt vmcnt(0) 817; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 818; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v4 819; 
GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 820; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 821; GCN-NEXT: v_xor_b32_e32 v5, v5, v11 822; GCN-NEXT: v_xor_b32_e32 v15, v8, v9 823; GCN-NEXT: v_xor_b32_e32 v4, v4, v9 824; GCN-NEXT: v_cvt_f32_u32_e32 v9, v5 825; GCN-NEXT: v_add_i32_e32 v0, vcc, v8, v0 826; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 827; GCN-NEXT: v_cvt_f32_u32_e32 v8, v4 828; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v9 829; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v6 830; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6 831; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1 832; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8 833; GCN-NEXT: v_xor_b32_e32 v6, v6, v13 834; GCN-NEXT: v_mul_f32_e32 v9, s2, v9 835; GCN-NEXT: v_xor_b32_e32 v16, v10, v11 836; GCN-NEXT: v_cvt_f32_u32_e32 v11, v6 837; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 838; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2 839; GCN-NEXT: v_add_i32_e32 v2, vcc, v12, v2 840; GCN-NEXT: v_xor_b32_e32 v17, v12, v13 841; GCN-NEXT: v_xor_b32_e32 v2, v2, v12 842; GCN-NEXT: v_mul_f32_e32 v8, s2, v8 843; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v5 844; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 845; GCN-NEXT: v_rcp_iflag_f32_e32 v11, v11 846; GCN-NEXT: v_mul_lo_u32 v12, v12, v9 847; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1 848; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 849; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 850; GCN-NEXT: v_mul_lo_u32 v10, v10, v8 851; GCN-NEXT: v_mul_hi_u32 v12, v9, v12 852; GCN-NEXT: v_mul_f32_e32 v11, s2, v11 853; GCN-NEXT: v_cvt_u32_f32_e32 v11, v11 854; GCN-NEXT: v_mul_hi_u32 v10, v8, v10 855; GCN-NEXT: v_add_i32_e32 v9, vcc, v12, v9 856; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v6 857; GCN-NEXT: v_mul_lo_u32 v12, v12, v11 858; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 859; GCN-NEXT: v_mul_hi_u32 v8, v0, v8 860; GCN-NEXT: v_mul_hi_u32 v12, v11, v12 861; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v7 862; GCN-NEXT: v_add_i32_e32 v7, vcc, v14, v7 863; GCN-NEXT: v_xor_b32_e32 v7, v7, v14 864; GCN-NEXT: v_cvt_f32_u32_e32 v10, v7 865; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, 
v11 866; GCN-NEXT: v_mul_lo_u32 v12, v8, v4 867; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 868; GCN-NEXT: v_mul_hi_u32 v9, v1, v9 869; GCN-NEXT: v_mul_hi_u32 v11, v2, v11 870; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 871; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v8 872; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 873; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] 874; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v4 875; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] 876; GCN-NEXT: v_mul_f32_e32 v10, s2, v10 877; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 878; GCN-NEXT: v_mul_lo_u32 v0, v9, v5 879; GCN-NEXT: v_cvt_u32_f32_e32 v4, v10 880; GCN-NEXT: v_mul_lo_u32 v10, v11, v6 881; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v8 882; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 883; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v9 884; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 885; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 886; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v11 887; GCN-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] 888; GCN-NEXT: v_sub_i32_e32 v9, vcc, v0, v5 889; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 890; GCN-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[4:5] 891; GCN-NEXT: v_sub_i32_e32 v11, vcc, v2, v6 892; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[2:3] 893; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v1 894; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 895; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] 896; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc 897; GCN-NEXT: v_xor_b32_e32 v1, v8, v15 898; GCN-NEXT: v_xor_b32_e32 v5, v0, v16 899; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v15 900; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v16 901; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v7 902; GCN-NEXT: v_mul_lo_u32 v5, v5, v4 903; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v3 904; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3 905; GCN-NEXT: v_mul_hi_u32 v5, v4, v5 906; GCN-NEXT: v_xor_b32_e32 v3, v3, v9 907; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v11, s[4:5] 908; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v10 909; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 
910; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 911; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 912; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v8, vcc 913; GCN-NEXT: v_xor_b32_e32 v2, v2, v17 914; GCN-NEXT: v_mul_lo_u32 v5, v4, v7 915; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v17 916; GCN-NEXT: v_xor_b32_e32 v6, v9, v14 917; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 918; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 919; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v7 920; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 921; GCN-NEXT: v_sub_i32_e32 v5, vcc, v3, v7 922; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 923; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 924; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 925; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 926; GCN-NEXT: v_xor_b32_e32 v3, v3, v6 927; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 928; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 929; GCN-NEXT: s_endpgm 930; 931; TONGA-LABEL: sdiv_v4i32: 932; TONGA: ; %bb.0: 933; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 934; TONGA-NEXT: s_mov_b32 s11, 0xf000 935; TONGA-NEXT: s_mov_b32 s10, -1 936; TONGA-NEXT: s_mov_b32 s6, s10 937; TONGA-NEXT: s_mov_b32 s7, s11 938; TONGA-NEXT: s_waitcnt lgkmcnt(0) 939; TONGA-NEXT: s_mov_b32 s4, s2 940; TONGA-NEXT: s_mov_b32 s5, s3 941; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 942; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 943; TONGA-NEXT: s_mov_b32 s2, 0x4f7ffffe 944; TONGA-NEXT: s_mov_b32 s8, s0 945; TONGA-NEXT: s_mov_b32 s9, s1 946; TONGA-NEXT: s_waitcnt vmcnt(1) 947; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v0 948; TONGA-NEXT: s_waitcnt vmcnt(0) 949; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 950; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v4 951; TONGA-NEXT: v_add_u32_e32 v5, vcc, v11, v5 952; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 953; TONGA-NEXT: v_xor_b32_e32 v5, v5, v11 954; TONGA-NEXT: v_xor_b32_e32 v15, v8, v9 955; TONGA-NEXT: v_xor_b32_e32 v4, v4, v9 956; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v5 957; TONGA-NEXT: v_add_u32_e32 
v0, vcc, v8, v0 958; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 959; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v4 960; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v9 961; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v6 962; TONGA-NEXT: v_add_u32_e32 v6, vcc, v13, v6 963; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 964; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v8 965; TONGA-NEXT: v_xor_b32_e32 v6, v6, v13 966; TONGA-NEXT: v_mul_f32_e32 v9, s2, v9 967; TONGA-NEXT: v_xor_b32_e32 v16, v10, v11 968; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v6 969; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 970; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v2 971; TONGA-NEXT: v_add_u32_e32 v2, vcc, v12, v2 972; TONGA-NEXT: v_xor_b32_e32 v17, v12, v13 973; TONGA-NEXT: v_xor_b32_e32 v2, v2, v12 974; TONGA-NEXT: v_mul_f32_e32 v8, s2, v8 975; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v5 976; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8 977; TONGA-NEXT: v_rcp_iflag_f32_e32 v11, v11 978; TONGA-NEXT: v_mul_lo_u32 v12, v12, v9 979; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1 980; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10 981; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4 982; TONGA-NEXT: v_mul_lo_u32 v10, v10, v8 983; TONGA-NEXT: v_mul_hi_u32 v12, v9, v12 984; TONGA-NEXT: v_mul_f32_e32 v11, s2, v11 985; TONGA-NEXT: v_cvt_u32_f32_e32 v11, v11 986; TONGA-NEXT: v_mul_hi_u32 v10, v8, v10 987; TONGA-NEXT: v_add_u32_e32 v9, vcc, v12, v9 988; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v6 989; TONGA-NEXT: v_mul_lo_u32 v12, v12, v11 990; TONGA-NEXT: v_add_u32_e32 v8, vcc, v10, v8 991; TONGA-NEXT: v_mul_hi_u32 v8, v0, v8 992; TONGA-NEXT: v_mul_hi_u32 v12, v11, v12 993; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7 994; TONGA-NEXT: v_add_u32_e32 v7, vcc, v14, v7 995; TONGA-NEXT: v_xor_b32_e32 v7, v7, v14 996; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v7 997; TONGA-NEXT: v_add_u32_e32 v11, vcc, v12, v11 998; TONGA-NEXT: v_mul_lo_u32 v12, v8, v4 999; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 1000; TONGA-NEXT: v_mul_hi_u32 v9, v1, v9 1001; TONGA-NEXT: v_mul_hi_u32 v11, v2, v11 1002; TONGA-NEXT: 
v_sub_u32_e32 v0, vcc, v0, v12 1003; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v8 1004; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 1005; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] 1006; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v4 1007; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] 1008; TONGA-NEXT: v_mul_f32_e32 v10, s2, v10 1009; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 1010; TONGA-NEXT: v_mul_lo_u32 v0, v9, v5 1011; TONGA-NEXT: v_cvt_u32_f32_e32 v4, v10 1012; TONGA-NEXT: v_mul_lo_u32 v10, v11, v6 1013; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v8 1014; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 1015; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v9 1016; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v10 1017; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 1018; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v11 1019; TONGA-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] 1020; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v0, v5 1021; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 1022; TONGA-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[4:5] 1023; TONGA-NEXT: v_sub_u32_e32 v11, vcc, v2, v6 1024; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[2:3] 1025; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v1 1026; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 1027; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] 1028; TONGA-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc 1029; TONGA-NEXT: v_xor_b32_e32 v1, v8, v15 1030; TONGA-NEXT: v_xor_b32_e32 v5, v0, v16 1031; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v15, v1 1032; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v16, v5 1033; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v7 1034; TONGA-NEXT: v_mul_lo_u32 v5, v5, v4 1035; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v3 1036; TONGA-NEXT: v_add_u32_e32 v3, vcc, v9, v3 1037; TONGA-NEXT: v_mul_hi_u32 v5, v4, v5 1038; TONGA-NEXT: v_xor_b32_e32 v3, v3, v9 1039; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v11, s[4:5] 1040; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v10 1041; TONGA-NEXT: v_add_u32_e32 v4, vcc, v5, v4 1042; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 1043; TONGA-NEXT: 
v_cmp_ge_u32_e32 vcc, v2, v6 1044; TONGA-NEXT: v_cndmask_b32_e32 v2, v10, v8, vcc 1045; TONGA-NEXT: v_xor_b32_e32 v2, v2, v17 1046; TONGA-NEXT: v_mul_lo_u32 v5, v4, v7 1047; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, v17, v2 1048; TONGA-NEXT: v_xor_b32_e32 v6, v9, v14 1049; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 1050; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4 1051; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v7 1052; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1053; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v3, v7 1054; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1055; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4 1056; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 1057; TONGA-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 1058; TONGA-NEXT: v_xor_b32_e32 v3, v3, v6 1059; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 1060; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 1061; TONGA-NEXT: s_endpgm 1062; 1063; GFX9-LABEL: sdiv_v4i32: 1064; GFX9: ; %bb.0: 1065; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1066; GFX9-NEXT: s_mov_b32 s11, 0xf000 1067; GFX9-NEXT: s_mov_b32 s10, -1 1068; GFX9-NEXT: s_mov_b32 s6, s10 1069; GFX9-NEXT: s_mov_b32 s7, s11 1070; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1071; GFX9-NEXT: s_mov_b32 s4, s2 1072; GFX9-NEXT: s_mov_b32 s5, s3 1073; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 1074; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 1075; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe 1076; GFX9-NEXT: s_mov_b32 s8, s0 1077; GFX9-NEXT: s_mov_b32 s9, s1 1078; GFX9-NEXT: s_waitcnt vmcnt(1) 1079; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0 1080; GFX9-NEXT: s_waitcnt vmcnt(0) 1081; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v4 1082; GFX9-NEXT: v_add_u32_e32 v4, v4, v9 1083; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v5 1084; GFX9-NEXT: v_add_u32_e32 v0, v0, v8 1085; GFX9-NEXT: v_xor_b32_e32 v4, v4, v9 1086; GFX9-NEXT: v_ashrrev_i32_e32 v10, 31, v1 1087; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v6 1088; GFX9-NEXT: v_xor_b32_e32 v16, v8, v9 1089; GFX9-NEXT: 
v_add_u32_e32 v5, v5, v11 1090; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 1091; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v4 1092; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v2 1093; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v7 1094; GFX9-NEXT: v_add_u32_e32 v1, v1, v10 1095; GFX9-NEXT: v_add_u32_e32 v6, v6, v13 1096; GFX9-NEXT: v_xor_b32_e32 v5, v5, v11 1097; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v3 1098; GFX9-NEXT: v_add_u32_e32 v2, v2, v12 1099; GFX9-NEXT: v_add_u32_e32 v7, v7, v15 1100; GFX9-NEXT: v_xor_b32_e32 v17, v10, v11 1101; GFX9-NEXT: v_xor_b32_e32 v1, v1, v10 1102; GFX9-NEXT: v_xor_b32_e32 v6, v6, v13 1103; GFX9-NEXT: v_cvt_f32_u32_e32 v10, v5 1104; GFX9-NEXT: v_add_u32_e32 v3, v3, v14 1105; GFX9-NEXT: v_xor_b32_e32 v18, v12, v13 1106; GFX9-NEXT: v_xor_b32_e32 v2, v2, v12 1107; GFX9-NEXT: v_xor_b32_e32 v7, v7, v15 1108; GFX9-NEXT: v_cvt_f32_u32_e32 v12, v6 1109; GFX9-NEXT: v_xor_b32_e32 v19, v14, v15 1110; GFX9-NEXT: v_xor_b32_e32 v3, v3, v14 1111; GFX9-NEXT: v_cvt_f32_u32_e32 v14, v7 1112; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8 1113; GFX9-NEXT: v_rcp_iflag_f32_e32 v10, v10 1114; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v12 1115; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v14 1116; GFX9-NEXT: v_mul_f32_e32 v8, s2, v8 1117; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8 1118; GFX9-NEXT: v_mul_f32_e32 v10, s2, v10 1119; GFX9-NEXT: v_mul_f32_e32 v12, s2, v12 1120; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v10 1121; GFX9-NEXT: v_sub_u32_e32 v9, 0, v4 1122; GFX9-NEXT: v_mul_f32_e32 v14, s2, v14 1123; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12 1124; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14 1125; GFX9-NEXT: v_mul_lo_u32 v9, v9, v8 1126; GFX9-NEXT: v_sub_u32_e32 v11, 0, v5 1127; GFX9-NEXT: v_sub_u32_e32 v13, 0, v6 1128; GFX9-NEXT: v_mul_lo_u32 v11, v11, v10 1129; GFX9-NEXT: v_sub_u32_e32 v15, 0, v7 1130; GFX9-NEXT: v_mul_lo_u32 v13, v13, v12 1131; GFX9-NEXT: v_mul_lo_u32 v15, v15, v14 1132; GFX9-NEXT: v_mul_hi_u32 v9, v8, v9 1133; GFX9-NEXT: v_mul_hi_u32 v11, v10, v11 1134; GFX9-NEXT: v_mul_hi_u32 v13, v12, v13 1135; 
GFX9-NEXT: v_mul_hi_u32 v15, v14, v15 1136; GFX9-NEXT: v_add_u32_e32 v8, v8, v9 1137; GFX9-NEXT: v_mul_hi_u32 v8, v0, v8 1138; GFX9-NEXT: v_add_u32_e32 v9, v10, v11 1139; GFX9-NEXT: v_add_u32_e32 v10, v12, v13 1140; GFX9-NEXT: v_mul_hi_u32 v9, v1, v9 1141; GFX9-NEXT: v_add_u32_e32 v11, v14, v15 1142; GFX9-NEXT: v_mul_hi_u32 v10, v2, v10 1143; GFX9-NEXT: v_mul_hi_u32 v11, v3, v11 1144; GFX9-NEXT: v_mul_lo_u32 v12, v8, v4 1145; GFX9-NEXT: v_mul_lo_u32 v14, v9, v5 1146; GFX9-NEXT: v_mul_lo_u32 v15, v10, v6 1147; GFX9-NEXT: v_add_u32_e32 v13, 1, v8 1148; GFX9-NEXT: v_sub_u32_e32 v0, v0, v12 1149; GFX9-NEXT: v_mul_lo_u32 v12, v11, v7 1150; GFX9-NEXT: v_sub_u32_e32 v1, v1, v14 1151; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 1152; GFX9-NEXT: v_add_u32_e32 v14, 1, v9 1153; GFX9-NEXT: v_sub_u32_e32 v2, v2, v15 1154; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc 1155; GFX9-NEXT: v_sub_u32_e32 v13, v0, v4 1156; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 1157; GFX9-NEXT: v_add_u32_e32 v15, 1, v10 1158; GFX9-NEXT: v_sub_u32_e32 v3, v3, v12 1159; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[0:1] 1160; GFX9-NEXT: v_sub_u32_e32 v14, v1, v5 1161; GFX9-NEXT: v_cmp_ge_u32_e64 s[2:3], v2, v6 1162; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc 1163; GFX9-NEXT: v_add_u32_e32 v12, 1, v11 1164; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[2:3] 1165; GFX9-NEXT: v_sub_u32_e32 v15, v2, v6 1166; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7 1167; GFX9-NEXT: v_add_u32_e32 v13, 1, v8 1168; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v14, s[0:1] 1169; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 1170; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] 1171; GFX9-NEXT: v_sub_u32_e32 v12, v3, v7 1172; GFX9-NEXT: v_add_u32_e32 v14, 1, v9 1173; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v15, s[2:3] 1174; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc 1175; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 1176; GFX9-NEXT: v_add_u32_e32 v15, 1, v10 1177; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[4:5] 1178; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v9, v14, vcc 1179; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 1180; GFX9-NEXT: v_add_u32_e32 v12, 1, v11 1181; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v15, vcc 1182; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 1183; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v12, vcc 1184; GFX9-NEXT: v_xor_b32_e32 v0, v0, v16 1185; GFX9-NEXT: v_xor_b32_e32 v1, v1, v17 1186; GFX9-NEXT: v_xor_b32_e32 v2, v2, v18 1187; GFX9-NEXT: v_xor_b32_e32 v3, v3, v19 1188; GFX9-NEXT: v_sub_u32_e32 v0, v0, v16 1189; GFX9-NEXT: v_sub_u32_e32 v1, v1, v17 1190; GFX9-NEXT: v_sub_u32_e32 v2, v2, v18 1191; GFX9-NEXT: v_sub_u32_e32 v3, v3, v19 1192; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 1193; GFX9-NEXT: s_endpgm 1194; 1195; EG-LABEL: sdiv_v4i32: 1196; EG: ; %bb.0: 1197; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 1198; EG-NEXT: TEX 1 @6 1199; EG-NEXT: ALU 101, @11, KC0[CB0:0-32], KC1[] 1200; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1 1201; EG-NEXT: CF_END 1202; EG-NEXT: PAD 1203; EG-NEXT: Fetch clause starting at 6: 1204; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 1205; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 1206; EG-NEXT: ALU clause starting at 10: 1207; EG-NEXT: MOV * T0.X, KC0[2].Z, 1208; EG-NEXT: ALU clause starting at 11: 1209; EG-NEXT: SETGT_INT * T2.W, 0.0, T1.W, 1210; EG-NEXT: ADD_INT * T1.W, T1.W, PV.W, 1211; EG-NEXT: XOR_INT * T1.W, PV.W, T2.W, 1212; EG-NEXT: SUB_INT T3.W, 0.0, PV.W, 1213; EG-NEXT: RECIP_UINT * T2.X, PV.W, 1214; EG-NEXT: SETGT_INT T4.W, 0.0, T0.W, 1215; EG-NEXT: MULLO_INT * T2.Y, PV.W, PS, 1216; EG-NEXT: SETGT_INT T2.Z, 0.0, T1.Y, 1217; EG-NEXT: ADD_INT T0.W, T0.W, PV.W, 1218; EG-NEXT: MULHI * T2.Y, T2.X, PS, 1219; EG-NEXT: ADD_INT T3.Z, T2.X, PS, 1220; EG-NEXT: XOR_INT T0.W, PV.W, T4.W, 1221; EG-NEXT: ADD_INT * T3.W, T1.Y, PV.Z, 1222; EG-NEXT: XOR_INT T3.W, PS, T2.Z, 1223; EG-NEXT: MULHI * T1.Y, PV.W, PV.Z, 1224; EG-NEXT: SUB_INT T5.W, 0.0, PV.W, 1225; EG-NEXT: RECIP_UINT * T2.X, PV.W, 1226; EG-NEXT: SETGT_INT T6.W, 0.0, T0.Y, 1227; 
EG-NEXT: MULLO_INT * T2.Y, PV.W, PS, 1228; EG-NEXT: ADD_INT T5.W, T0.Y, PV.W, 1229; EG-NEXT: MULHI * T0.Y, T2.X, PS, 1230; EG-NEXT: ADD_INT T0.Y, T2.X, PS, 1231; EG-NEXT: XOR_INT T3.Z, PV.W, T6.W, BS:VEC_021/SCL_122 1232; EG-NEXT: SETGT_INT T5.W, 0.0, T1.Z, 1233; EG-NEXT: MULLO_INT * T2.X, T1.Y, T1.W, 1234; EG-NEXT: ADD_INT T7.W, T1.Z, PV.W, 1235; EG-NEXT: MULHI * T0.Y, PV.Z, PV.Y, 1236; EG-NEXT: XOR_INT T7.W, PV.W, T5.W, BS:VEC_021/SCL_122 1237; EG-NEXT: MULLO_INT * T1.Z, PS, T3.W, 1238; EG-NEXT: SUB_INT T4.Z, 0.0, PV.W, 1239; EG-NEXT: SETGT_INT T8.W, 0.0, T1.X, 1240; EG-NEXT: RECIP_UINT * T2.Y, PV.W, 1241; EG-NEXT: ADD_INT T9.W, T1.X, PV.W, 1242; EG-NEXT: MULLO_INT * T1.X, PV.Z, PS, 1243; EG-NEXT: SETGT_INT T4.Z, 0.0, T0.Z, 1244; EG-NEXT: XOR_INT T9.W, PV.W, T8.W, 1245; EG-NEXT: MULHI * T1.X, T2.Y, PS, 1246; EG-NEXT: ADD_INT T1.X, T2.Y, PS, 1247; EG-NEXT: SUB_INT T2.Y, 0.0, PV.W, 1248; EG-NEXT: SUB_INT T1.Z, T3.Z, T1.Z, 1249; EG-NEXT: ADD_INT T10.W, T0.Z, PV.Z, BS:VEC_201 1250; EG-NEXT: RECIP_UINT * T0.Z, PV.W, 1251; EG-NEXT: XOR_INT T3.X, PV.W, T4.Z, 1252; EG-NEXT: ADD_INT T3.Y, T0.Y, 1, 1253; EG-NEXT: SETGE_UINT T3.Z, PV.Z, T3.W, 1254; EG-NEXT: SUB_INT T10.W, PV.Z, T3.W, 1255; EG-NEXT: MULLO_INT * T2.Y, PV.Y, PS, 1256; EG-NEXT: CNDE_INT T1.Z, PV.Z, T1.Z, PV.W, 1257; EG-NEXT: CNDE_INT T10.W, PV.Z, T0.Y, PV.Y, 1258; EG-NEXT: MULHI * T0.Y, PV.X, T1.X, 1259; EG-NEXT: SETGT_INT T3.Y, 0.0, T0.X, 1260; EG-NEXT: ADD_INT T3.Z, PV.W, 1, 1261; EG-NEXT: SETGE_UINT T3.W, PV.Z, T3.W, BS:VEC_021/SCL_122 1262; EG-NEXT: MULLO_INT * T1.X, PS, T7.W, 1263; EG-NEXT: CNDE_INT T4.Y, PV.W, T10.W, PV.Z, 1264; EG-NEXT: ADD_INT T1.Z, T0.X, PV.Y, 1265; EG-NEXT: SUB_INT T3.W, T3.X, PS, BS:VEC_120/SCL_212 1266; EG-NEXT: MULHI * T0.X, T0.Z, T2.Y, 1267; EG-NEXT: ADD_INT T1.X, T0.Y, 1, 1268; EG-NEXT: SETGE_UINT T2.Y, PV.W, T7.W, 1269; EG-NEXT: ADD_INT T0.Z, T0.Z, PS, 1270; EG-NEXT: XOR_INT T10.W, PV.Z, T3.Y, 1271; EG-NEXT: SUB_INT * T0.W, T0.W, T2.X, 1272; EG-NEXT: SUB_INT T0.X, T3.W, T7.W, 
1273; EG-NEXT: ADD_INT T5.Y, T1.Y, 1, 1274; EG-NEXT: SETGE_UINT T1.Z, PS, T1.W, BS:VEC_021/SCL_122 1275; EG-NEXT: SUB_INT T11.W, PS, T1.W, BS:VEC_021/SCL_122 1276; EG-NEXT: MULHI * T0.Z, PV.W, PV.Z, 1277; EG-NEXT: CNDE_INT T2.X, PV.Z, T0.W, PV.W, BS:VEC_021/SCL_122 1278; EG-NEXT: CNDE_INT T1.Y, PV.Z, T1.Y, PV.Y, 1279; EG-NEXT: CNDE_INT T1.Z, T2.Y, T3.W, PV.X, BS:VEC_201 1280; EG-NEXT: CNDE_INT T0.W, T2.Y, T0.Y, T1.X, BS:VEC_201 1281; EG-NEXT: MULLO_INT * T0.X, PS, T9.W, 1282; EG-NEXT: ADD_INT T1.X, PV.W, 1, 1283; EG-NEXT: SETGE_UINT T0.Y, PV.Z, T7.W, 1284; EG-NEXT: ADD_INT T1.Z, PV.Y, 1, 1285; EG-NEXT: SETGE_UINT T1.W, PV.X, T1.W, BS:VEC_102/SCL_221 1286; EG-NEXT: SUB_INT * T3.W, T10.W, PS, 1287; EG-NEXT: ADD_INT T0.X, T0.Z, 1, 1288; EG-NEXT: SETGE_UINT T2.Y, PS, T9.W, BS:VEC_102/SCL_221 1289; EG-NEXT: SUB_INT T3.Z, PS, T9.W, BS:VEC_102/SCL_221 1290; EG-NEXT: CNDE_INT T1.W, PV.W, T1.Y, PV.Z, 1291; EG-NEXT: XOR_INT * T2.W, T4.W, T2.W, 1292; EG-NEXT: XOR_INT T2.X, PV.W, PS, 1293; EG-NEXT: CNDE_INT T1.Y, PV.Y, T3.W, PV.Z, BS:VEC_021/SCL_122 1294; EG-NEXT: CNDE_INT T0.Z, PV.Y, T0.Z, PV.X, 1295; EG-NEXT: CNDE_INT T0.W, T0.Y, T0.W, T1.X, BS:VEC_102/SCL_221 1296; EG-NEXT: XOR_INT * T1.W, T4.Z, T5.W, 1297; EG-NEXT: XOR_INT T0.X, T6.W, T2.Z, 1298; EG-NEXT: XOR_INT T0.Y, PV.W, PS, 1299; EG-NEXT: ADD_INT T1.Z, PV.Z, 1, 1300; EG-NEXT: SETGE_UINT T0.W, PV.Y, T9.W, BS:VEC_021/SCL_122 1301; EG-NEXT: SUB_INT * T2.W, PV.X, T2.W, 1302; EG-NEXT: CNDE_INT T1.Y, PV.W, T0.Z, PV.Z, 1303; EG-NEXT: SUB_INT T2.Z, PV.Y, T1.W, 1304; EG-NEXT: XOR_INT T0.W, T3.Y, T8.W, BS:VEC_021/SCL_122 1305; EG-NEXT: XOR_INT * T1.W, T4.Y, PV.X, 1306; EG-NEXT: SUB_INT T2.Y, PS, T0.X, 1307; EG-NEXT: XOR_INT * T1.W, PV.Y, PV.W, 1308; EG-NEXT: SUB_INT T2.X, PV.W, T0.W, 1309; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1310; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1311 %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 1312 %num = load <4 x i32>, <4 x i32> addrspace(1) * %in 1313 %den = load 
<4 x i32>, <4 x i32> addrspace(1) * %den_ptr 1314 %result = sdiv <4 x i32> %num, %den 1315 store <4 x i32> %result, <4 x i32> addrspace(1)* %out 1316 ret void 1317} 1318 1319define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { 1320; GCN-LABEL: sdiv_v4i32_4: 1321; GCN: ; %bb.0: 1322; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1323; GCN-NEXT: s_mov_b32 s7, 0xf000 1324; GCN-NEXT: s_mov_b32 s6, -1 1325; GCN-NEXT: s_mov_b32 s10, s6 1326; GCN-NEXT: s_mov_b32 s11, s7 1327; GCN-NEXT: s_waitcnt lgkmcnt(0) 1328; GCN-NEXT: s_mov_b32 s8, s2 1329; GCN-NEXT: s_mov_b32 s9, s3 1330; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 1331; GCN-NEXT: s_mov_b32 s4, s0 1332; GCN-NEXT: s_mov_b32 s5, s1 1333; GCN-NEXT: s_waitcnt vmcnt(0) 1334; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 1335; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1 1336; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v2 1337; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3 1338; GCN-NEXT: v_lshrrev_b32_e32 v4, 30, v4 1339; GCN-NEXT: v_lshrrev_b32_e32 v5, 30, v5 1340; GCN-NEXT: v_lshrrev_b32_e32 v6, 30, v6 1341; GCN-NEXT: v_lshrrev_b32_e32 v7, 30, v7 1342; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 1343; GCN-NEXT: v_add_i32_e32 v1, vcc, v5, v1 1344; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2 1345; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3 1346; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 1347; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1 1348; GCN-NEXT: v_ashrrev_i32_e32 v2, 2, v2 1349; GCN-NEXT: v_ashrrev_i32_e32 v3, 2, v3 1350; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1351; GCN-NEXT: s_endpgm 1352; 1353; TONGA-LABEL: sdiv_v4i32_4: 1354; TONGA: ; %bb.0: 1355; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1356; TONGA-NEXT: s_mov_b32 s7, 0xf000 1357; TONGA-NEXT: s_mov_b32 s6, -1 1358; TONGA-NEXT: s_mov_b32 s10, s6 1359; TONGA-NEXT: s_mov_b32 s11, s7 1360; TONGA-NEXT: s_waitcnt lgkmcnt(0) 1361; TONGA-NEXT: s_mov_b32 s8, s2 1362; TONGA-NEXT: s_mov_b32 s9, s3 1363; TONGA-NEXT: buffer_load_dwordx4 v[0:3], 
off, s[8:11], 0 1364; TONGA-NEXT: s_mov_b32 s4, s0 1365; TONGA-NEXT: s_mov_b32 s5, s1 1366; TONGA-NEXT: s_waitcnt vmcnt(0) 1367; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 1368; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v1 1369; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v2 1370; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 1371; TONGA-NEXT: v_lshrrev_b32_e32 v4, 30, v4 1372; TONGA-NEXT: v_lshrrev_b32_e32 v5, 30, v5 1373; TONGA-NEXT: v_lshrrev_b32_e32 v6, 30, v6 1374; TONGA-NEXT: v_lshrrev_b32_e32 v7, 30, v7 1375; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 1376; TONGA-NEXT: v_add_u32_e32 v1, vcc, v5, v1 1377; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v2 1378; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3 1379; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 1380; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1 1381; TONGA-NEXT: v_ashrrev_i32_e32 v2, 2, v2 1382; TONGA-NEXT: v_ashrrev_i32_e32 v3, 2, v3 1383; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1384; TONGA-NEXT: s_endpgm 1385; 1386; GFX9-LABEL: sdiv_v4i32_4: 1387; GFX9: ; %bb.0: 1388; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1389; GFX9-NEXT: s_mov_b32 s7, 0xf000 1390; GFX9-NEXT: s_mov_b32 s6, -1 1391; GFX9-NEXT: s_mov_b32 s10, s6 1392; GFX9-NEXT: s_mov_b32 s11, s7 1393; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1394; GFX9-NEXT: s_mov_b32 s8, s2 1395; GFX9-NEXT: s_mov_b32 s9, s3 1396; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 1397; GFX9-NEXT: s_mov_b32 s4, s0 1398; GFX9-NEXT: s_mov_b32 s5, s1 1399; GFX9-NEXT: s_waitcnt vmcnt(0) 1400; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0 1401; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1 1402; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v2 1403; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v3 1404; GFX9-NEXT: v_lshrrev_b32_e32 v4, 30, v4 1405; GFX9-NEXT: v_lshrrev_b32_e32 v5, 30, v5 1406; GFX9-NEXT: v_lshrrev_b32_e32 v6, 30, v6 1407; GFX9-NEXT: v_lshrrev_b32_e32 v7, 30, v7 1408; GFX9-NEXT: v_add_u32_e32 v0, v0, v4 1409; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 1410; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 1411; GFX9-NEXT: 
v_add_u32_e32 v3, v3, v7 1412; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 1413; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1 1414; GFX9-NEXT: v_ashrrev_i32_e32 v2, 2, v2 1415; GFX9-NEXT: v_ashrrev_i32_e32 v3, 2, v3 1416; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1417; GFX9-NEXT: s_endpgm 1418; 1419; EG-LABEL: sdiv_v4i32_4: 1420; EG: ; %bb.0: 1421; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1422; EG-NEXT: TEX 0 @6 1423; EG-NEXT: ALU 24, @9, KC0[CB0:0-32], KC1[] 1424; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 1425; EG-NEXT: CF_END 1426; EG-NEXT: PAD 1427; EG-NEXT: Fetch clause starting at 6: 1428; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 1429; EG-NEXT: ALU clause starting at 8: 1430; EG-NEXT: MOV * T0.X, KC0[2].Z, 1431; EG-NEXT: ALU clause starting at 9: 1432; EG-NEXT: ASHR T1.W, T0.W, literal.x, 1433; EG-NEXT: ASHR * T2.W, T0.Z, literal.x, 1434; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1435; EG-NEXT: LSHR * T1.W, PV.W, literal.x, 1436; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) 1437; EG-NEXT: ADD_INT T1.Z, T0.W, PV.W, 1438; EG-NEXT: LSHR T0.W, T2.W, literal.x, BS:VEC_120/SCL_212 1439; EG-NEXT: ASHR * T1.W, T0.Y, literal.y, 1440; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44) 1441; EG-NEXT: LSHR T1.Y, PS, literal.x, 1442; EG-NEXT: ASHR T2.Z, T0.X, literal.y, 1443; EG-NEXT: ADD_INT T0.W, T0.Z, PV.W, 1444; EG-NEXT: ASHR * T1.W, PV.Z, literal.z, 1445; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44) 1446; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1447; EG-NEXT: ASHR T1.Z, PV.W, literal.x, 1448; EG-NEXT: LSHR T0.W, PV.Z, literal.y, 1449; EG-NEXT: ADD_INT * T2.W, T0.Y, PV.Y, 1450; EG-NEXT: 2(2.802597e-45), 30(4.203895e-44) 1451; EG-NEXT: ASHR T1.Y, PS, literal.x, 1452; EG-NEXT: ADD_INT * T0.W, T0.X, PV.W, 1453; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1454; EG-NEXT: ASHR T1.X, PV.W, literal.x, 1455; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1456; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1457 %num = load <4 x i32>, <4 x i32> addrspace(1) * %in 1458 %result = 
sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4> 1459 store <4 x i32> %result, <4 x i32> addrspace(1)* %out 1460 ret void 1461} 1462 1463define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { 1464; GCN-LABEL: v_sdiv_i8: 1465; GCN: ; %bb.0: 1466; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1467; GCN-NEXT: s_mov_b32 s7, 0xf000 1468; GCN-NEXT: s_mov_b32 s6, -1 1469; GCN-NEXT: s_mov_b32 s10, s6 1470; GCN-NEXT: s_mov_b32 s11, s7 1471; GCN-NEXT: s_waitcnt lgkmcnt(0) 1472; GCN-NEXT: s_mov_b32 s8, s2 1473; GCN-NEXT: s_mov_b32 s9, s3 1474; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 1475; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 1476; GCN-NEXT: s_mov_b32 s4, s0 1477; GCN-NEXT: s_mov_b32 s5, s1 1478; GCN-NEXT: s_waitcnt vmcnt(1) 1479; GCN-NEXT: v_cvt_f32_i32_e32 v2, v0 1480; GCN-NEXT: s_waitcnt vmcnt(0) 1481; GCN-NEXT: v_cvt_f32_i32_e32 v3, v1 1482; GCN-NEXT: v_xor_b32_e32 v0, v1, v0 1483; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 1484; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 1485; GCN-NEXT: v_or_b32_e32 v0, 1, v0 1486; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 1487; GCN-NEXT: v_trunc_f32_e32 v1, v1 1488; GCN-NEXT: v_mad_f32 v3, -v1, v2, v3 1489; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 1490; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 1491; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 1492; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1493; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8 1494; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 1495; GCN-NEXT: s_endpgm 1496; 1497; TONGA-LABEL: v_sdiv_i8: 1498; TONGA: ; %bb.0: 1499; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1500; TONGA-NEXT: s_mov_b32 s7, 0xf000 1501; TONGA-NEXT: s_mov_b32 s6, -1 1502; TONGA-NEXT: s_mov_b32 s10, s6 1503; TONGA-NEXT: s_mov_b32 s11, s7 1504; TONGA-NEXT: s_waitcnt lgkmcnt(0) 1505; TONGA-NEXT: s_mov_b32 s8, s2 1506; TONGA-NEXT: s_mov_b32 s9, s3 1507; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 1508; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 1509; TONGA-NEXT: 
s_mov_b32 s4, s0 1510; TONGA-NEXT: s_mov_b32 s5, s1 1511; TONGA-NEXT: s_waitcnt vmcnt(1) 1512; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v0 1513; TONGA-NEXT: s_waitcnt vmcnt(0) 1514; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v1 1515; TONGA-NEXT: v_xor_b32_e32 v0, v1, v0 1516; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 1517; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2 1518; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 1519; TONGA-NEXT: v_mul_f32_e32 v1, v3, v4 1520; TONGA-NEXT: v_trunc_f32_e32 v1, v1 1521; TONGA-NEXT: v_mad_f32 v3, -v1, v2, v3 1522; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1 1523; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 1524; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 1525; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 1526; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 8 1527; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 1528; TONGA-NEXT: s_endpgm 1529; 1530; GFX9-LABEL: v_sdiv_i8: 1531; GFX9: ; %bb.0: 1532; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1533; GFX9-NEXT: s_mov_b32 s7, 0xf000 1534; GFX9-NEXT: s_mov_b32 s6, -1 1535; GFX9-NEXT: s_mov_b32 s10, s6 1536; GFX9-NEXT: s_mov_b32 s11, s7 1537; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1538; GFX9-NEXT: s_mov_b32 s8, s2 1539; GFX9-NEXT: s_mov_b32 s9, s3 1540; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 1541; GFX9-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 1542; GFX9-NEXT: s_mov_b32 s4, s0 1543; GFX9-NEXT: s_mov_b32 s5, s1 1544; GFX9-NEXT: s_waitcnt vmcnt(1) 1545; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v0 1546; GFX9-NEXT: s_waitcnt vmcnt(0) 1547; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 1548; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 1549; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 1550; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 1551; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 1552; GFX9-NEXT: v_mul_f32_e32 v1, v3, v4 1553; GFX9-NEXT: v_trunc_f32_e32 v1, v1 1554; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v1 1555; GFX9-NEXT: v_mad_f32 v1, -v1, v2, v3 1556; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 1557; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 1558; GFX9-NEXT: 
v_add_u32_e32 v0, v4, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v_sdiv_i8:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 21, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T1.X, T0.X, 1, #1
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: BFE_INT * T0.W, T1.X, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: INT_TO_FLT * T0.Y, PV.W,
; EG-NEXT: BFE_INT T1.W, T0.X, 0.0, literal.x,
; EG-NEXT: RECIP_IEEE * T0.X, PS,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: INT_TO_FLT * T0.Z, PV.W,
; EG-NEXT: MUL_IEEE * T2.W, PS, T0.X,
; EG-NEXT: TRUNC T2.W, PV.W,
; EG-NEXT: XOR_INT * T0.W, T1.W, T0.W,
; EG-NEXT: ASHR T0.W, PS, literal.x,
; EG-NEXT: MULADD_IEEE * T1.W, -PV.W, T0.Y, T0.Z,
; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
; EG-NEXT: TRUNC T0.Z, T2.W,
; EG-NEXT: SETGE T1.W, |PS|, |T0.Y|,
; EG-NEXT: OR_INT * T0.W, PV.W, 1,
; EG-NEXT: CNDE T0.W, PV.W, 0.0, PS,
; EG-NEXT: FLT_TO_INT * T1.W, PV.Z,
; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %num = load i8, i8 addrspace(1) * %in
  %den = load i8, i8 addrspace(1) * %den_ptr
  %result = sdiv i8 %num, %den
  %result.ext = sext i8 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}

; v_sdiv_i23 - signed division of two i23 values read from %in.
; Element 0 of %in is the numerator and element 1 the denominator; the i23
; quotient is sign-extended to i32 and stored to %out.
; The expected GCN, TONGA and GFX9 output converts both operands with
; v_cvt_f32_i32 and divides through v_rcp_iflag_f32, then re-extends the low
; 23 bits of the result with v_bfe_i32.
define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
; GCN-LABEL: v_sdiv_i23:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s10, s6
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6
; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
; GCN-NEXT: buffer_load_ushort v3, off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_or_b32_e32 v1, v2, v1
; GCN-NEXT: v_bfe_i32 v1, v1, 0, 23
; GCN-NEXT: v_cvt_f32_i32_e32 v2, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or_b32_e32 v0, v3, v0
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23
; GCN-NEXT: v_cvt_f32_i32_e32 v3, v0
; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2
; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0
; GCN-NEXT: v_or_b32_e32 v0, 1, v0
; GCN-NEXT: v_mul_f32_e32 v1, v3, v4
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_mad_f32 v3, -v1, v2, v3
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2|
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: v_sdiv_i23:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s7, 0xf000
; TONGA-NEXT: s_mov_b32 s6, -1
; TONGA-NEXT: s_mov_b32 s10, s6
; TONGA-NEXT: s_mov_b32 s11, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s8, s2
; TONGA-NEXT: s_mov_b32 s9, s3
; TONGA-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; TONGA-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6
; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
; TONGA-NEXT: buffer_load_ushort v3, off, s[8:11], 0
; TONGA-NEXT: s_mov_b32 s4, s0
; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: s_waitcnt vmcnt(3)
; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; TONGA-NEXT: s_waitcnt vmcnt(2)
; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; TONGA-NEXT: s_waitcnt vmcnt(1)
; TONGA-NEXT: v_or_b32_e32 v1, v2, v1
; TONGA-NEXT: v_bfe_i32 v1, v1, 0, 23
; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v1
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_or_b32_e32 v0, v3, v0
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23
; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v0
; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0
; TONGA-NEXT: v_or_b32_e32 v0, 1, v0
; TONGA-NEXT: v_mul_f32_e32 v1, v3, v4
; TONGA-NEXT: v_trunc_f32_e32 v1, v1
; TONGA-NEXT: v_mad_f32 v3, -v1, v2, v3
; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1
; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2|
; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23
; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: v_sdiv_i23:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s10, s6
; GFX9-NEXT: s_mov_b32 s11, s7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s2
; GFX9-NEXT: s_mov_b32 s9, s3
; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6
; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
; GFX9-NEXT: buffer_load_ushort v3, off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_e32 v1, v2, v1
; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 23
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0
; GFX9-NEXT: v_or_b32_e32 v0, 1, v0
; GFX9-NEXT: v_mul_f32_e32 v1, v3, v4
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v1
; GFX9-NEXT: v_mad_f32 v1, -v1, v2, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2|
; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v_sdiv_i23:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @6
; EG-NEXT: ALU 33, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1
; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
; EG-NEXT: 
16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT T0.W, T0.X, PV.W,
; EG-NEXT: LSHL * T1.W, T3.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T0.W, PV.W, literal.x,
; EG-NEXT: OR_INT * T1.W, T2.X, T1.W,
; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
; EG-NEXT: LSHL T1.W, PS, literal.x,
; EG-NEXT: INT_TO_FLT * T0.X, PV.W,
; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PV.W, literal.x,
; EG-NEXT: RECIP_IEEE * T0.Y, PS,
; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
; EG-NEXT: INT_TO_FLT * T0.Z, PV.W,
; EG-NEXT: MUL_IEEE * T2.W, PS, T0.Y,
; EG-NEXT: TRUNC T2.W, PV.W,
; EG-NEXT: XOR_INT * T0.W, T1.W, T0.W,
; EG-NEXT: ASHR T0.W, PS, literal.x,
; EG-NEXT: MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z,
; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
; EG-NEXT: TRUNC T0.Z, T2.W,
; EG-NEXT: SETGE T1.W, |PS|, |T0.X|,
; EG-NEXT: OR_INT * T0.W, PV.W, 1,
; EG-NEXT: CNDE T0.W, PV.W, 0.0, PS,
; EG-NEXT: FLT_TO_INT * T1.W, PV.Z,
; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 9(1.261169e-44), 2(2.802597e-45)
  %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
  %num = load i23, i23 addrspace(1) * %in
  %den = load i23, i23 addrspace(1) * %den_ptr
  %result = sdiv i23 %num, %den
  %result.ext = sext i23 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}

; v_sdiv_i24 - signed division of two i24 values read from %in.
; Element 0 of %in is the numerator and element 1 the denominator; the i24
; quotient is sign-extended to i32 and stored to %out.
; The expected GCN, TONGA and GFX9 output divides through v_rcp_iflag_f32 on
; operands converted with v_cvt_f32_i32, whereas the expected EG output uses
; the integer RECIP_UINT / MULLO_INT / MULHI sequence.
define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
; GCN-LABEL: v_sdiv_i24:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s10, s6
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6
; GCN-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
; GCN-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2
; GCN-NEXT: buffer_load_ushort v3, off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_or_b32_e32 v1, v1, v4
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or_b32_e32 v3, v3, v4
; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3
; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1
; GCN-NEXT: v_xor_b32_e32 v0, v2, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0
; GCN-NEXT: v_or_b32_e32 v0, 1, v0
; GCN-NEXT: v_mul_f32_e32 v2, v3, v4
; GCN-NEXT: v_trunc_f32_e32 v2, v2
; GCN-NEXT: v_mad_f32 v3, -v2, v1, v3
; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1|
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: v_sdiv_i24:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s7, 0xf000
; TONGA-NEXT: s_mov_b32 s6, -1
; TONGA-NEXT: s_mov_b32 s10, s6
; TONGA-NEXT: s_mov_b32 s11, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s8, s2
; TONGA-NEXT: s_mov_b32 s9, s3
; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6
; TONGA-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
; TONGA-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2
; TONGA-NEXT: buffer_load_ushort v3, off, s[8:11], 0
; TONGA-NEXT: s_mov_b32 s4, s0
; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: s_waitcnt vmcnt(3)
; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; TONGA-NEXT: s_waitcnt vmcnt(2)
; TONGA-NEXT: v_or_b32_e32 v1, v1, v4
; TONGA-NEXT: v_cvt_f32_i32_e32 v1, v1
; TONGA-NEXT: s_waitcnt vmcnt(1)
; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_or_b32_e32 v3, v3, v4
; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v3
; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v1
; TONGA-NEXT: v_xor_b32_e32 v0, v2, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0
; TONGA-NEXT: v_or_b32_e32 v0, 1, v0
; TONGA-NEXT: v_mul_f32_e32 v2, v3, v4
; TONGA-NEXT: v_trunc_f32_e32 v2, v2
; TONGA-NEXT: v_mad_f32 v3, -v2, v1, v3
; TONGA-NEXT: v_cvt_i32_f32_e32 v2, v2
; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1|
; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 24
; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: v_sdiv_i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s10, s6
; GFX9-NEXT: s_mov_b32 s11, s7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s2
; GFX9-NEXT: s_mov_b32 s9, s3
; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
; GFX9-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2
; GFX9-NEXT: buffer_load_ushort v3, off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_e32 v1, v1, v4
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v1
; GFX9-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0
; GFX9-NEXT: v_or_b32_e32 v0, 1, v0
; GFX9-NEXT: v_mul_f32_e32 v2, v3, v4
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2
; GFX9-NEXT: v_mad_f32 v2, -v2, v1, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1|
; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 24
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v_sdiv_i24:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @6
; EG-NEXT: ALU 39, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1
; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: BFE_INT * T0.W, T1.X, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T0.X, PV.W,
; EG-NEXT: SETGT_INT * T1.W, 0.0, PV.W,
; EG-NEXT: BFE_INT T2.W, T3.X, 0.0, literal.x,
; EG-NEXT: ADD_INT * T0.W, T0.W, PV.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHL T2.W, PV.W, literal.x,
; EG-NEXT: XOR_INT 
* T0.W, PS, T1.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: SUB_INT T0.Z, 0.0, PS,
; EG-NEXT: OR_INT T2.W, T2.X, PV.W,
; EG-NEXT: RECIP_UINT * T0.X, PS,
; EG-NEXT: SETGT_INT T3.W, 0.0, PV.W,
; EG-NEXT: MULLO_INT * T0.Y, PV.Z, PS,
; EG-NEXT: ADD_INT T2.W, T2.W, PV.W,
; EG-NEXT: MULHI * T0.Y, T0.X, PS,
; EG-NEXT: ADD_INT T4.W, T0.X, PS,
; EG-NEXT: XOR_INT * T2.W, PV.W, T3.W,
; EG-NEXT: MULHI * T0.X, PS, PV.W,
; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W,
; EG-NEXT: SUB_INT * T2.W, T2.W, PS,
; EG-NEXT: ADD_INT T0.Z, T0.X, 1,
; EG-NEXT: SETGE_UINT T4.W, PV.W, T0.W,
; EG-NEXT: SUB_INT * T5.W, PV.W, T0.W,
; EG-NEXT: CNDE_INT T2.W, PV.W, T2.W, PS,
; EG-NEXT: CNDE_INT * T4.W, PV.W, T0.X, PV.Z,
; EG-NEXT: ADD_INT T5.W, PS, 1,
; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W,
; EG-NEXT: CNDE_INT T0.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221
; EG-NEXT: XOR_INT * T1.W, T3.W, T1.W,
; EG-NEXT: XOR_INT * T0.W, PV.W, PS,
; EG-NEXT: SUB_INT * T0.W, PV.W, T1.W,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
  %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
  %num = load i24, i24 addrspace(1) * %in
  %den = load i24, i24 addrspace(1) * %den_ptr
  %result = sdiv i24 %num, %den
  %result.ext = sext i24 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}

; v_sdiv_i25 - signed division of two i25 values read from %in.
; Element 0 of %in is the numerator and element 1 the denominator; the i25
; quotient is sign-extended to i32 and stored to %out.
; The expected output for every checked target uses the integer
; multiply-based sequence (v_mul_lo_u32 / v_mul_hi_u32 on GCN, TONGA and GFX9;
; MULLO_INT / MULHI on EG) and finally re-extends the low 25 bits.
define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
; GCN-LABEL: v_sdiv_i25:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s10, s6
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_i32 v2, v1, 0, 25
; GCN-NEXT: v_bfe_i32 v1, v1, 24, 1
; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v2
; GCN-NEXT: v_xor_b32_e32 v2, v2, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v3, v2
; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
; GCN-NEXT: v_bfe_i32 v5, v0, 0, 25
; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GCN-NEXT: v_bfe_i32 v0, v0, 24, 1
; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v5
; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
; GCN-NEXT: v_xor_b32_e32 v5, v5, v0
; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
; GCN-NEXT: v_mul_lo_u32 v4, v4, v3
; GCN-NEXT: v_mul_hi_u32 v4, v3, v4
; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GCN-NEXT: v_mul_hi_u32 v3, v5, v3
; GCN-NEXT: v_mul_lo_u32 v1, v3, v2
; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3
; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v1, v5
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v2
; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v2, v1
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1]
; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GCN-NEXT: v_xor_b32_e32 v1, v1, v0
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: v_sdiv_i25:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s7, 0xf000
; TONGA-NEXT: s_mov_b32 s6, -1
; TONGA-NEXT: s_mov_b32 s10, s6
; TONGA-NEXT: s_mov_b32 s11, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s8, s2
; TONGA-NEXT: s_mov_b32 s9, s3
; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; TONGA-NEXT: s_mov_b32 s4, s0
; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_bfe_i32 v2, v1, 0, 25
; TONGA-NEXT: v_bfe_i32 v1, v1, 24, 1
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v1, v2
; TONGA-NEXT: v_xor_b32_e32 v2, v2, v1
; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2
; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v2
; TONGA-NEXT: v_bfe_i32 v5, v0, 0, 25
; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
; TONGA-NEXT: v_bfe_i32 v0, v0, 24, 1
; TONGA-NEXT: v_add_u32_e32 v5, vcc, v0, v5
; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
; TONGA-NEXT: v_xor_b32_e32 v5, v5, v0
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1
; TONGA-NEXT: v_mul_lo_u32 v4, v4, v3
; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4
; TONGA-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; TONGA-NEXT: v_mul_hi_u32 v3, v5, v3
; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2
; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3
; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v1, v5
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v2
; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v2, v1
; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1]
; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0
; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25
; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: v_sdiv_i25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s10, s6
; GFX9-NEXT: s_mov_b32 s11, s7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s2
; GFX9-NEXT: s_mov_b32 s9, s3
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfe_i32 v2, v1, 0, 25
; GFX9-NEXT: v_bfe_i32 v1, v1, 24, 1
; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
; GFX9-NEXT: v_xor_b32_e32 v2, v2, v1
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v2
; GFX9-NEXT: v_sub_u32_e32 v4, 0, v2
; GFX9-NEXT: v_bfe_i32 v5, v0, 0, 25
; GFX9-NEXT: v_bfe_i32 v0, v0, 24, 1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX9-NEXT: v_add_u32_e32 v5, v5, v0
; GFX9-NEXT: v_xor_b32_e32 v5, v5, v0
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3
; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4
; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
; GFX9-NEXT: v_mul_hi_u32 v3, v5, v3
; GFX9-NEXT: v_mul_lo_u32 v4, v3, v2
; GFX9-NEXT: v_add_u32_e32 v1, 1, v3
; GFX9-NEXT: v_sub_u32_e32 v4, v5, v4
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT: v_xor_b32_e32 v1, v1, v0
; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 25
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v_sdiv_i25:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: MOV * T1.X, PV.X,
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
; EG-NEXT: SETGT_INT * T1.W, 0.0, PV.W,
; EG-NEXT: ADD_INT T0.W, T0.W, PV.W,
; EG-NEXT: LSHL * T2.W, T1.X, literal.x,
; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
; EG-NEXT: XOR_INT * T0.W, PV.W, T1.W,
; EG-NEXT: SUB_INT T0.Z, 0.0, PV.W,
; EG-NEXT: ASHR T2.W, T2.W, literal.x,
; EG-NEXT: RECIP_UINT * T0.X, PV.W,
; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
; EG-NEXT: SETGT_INT T3.W, 0.0, PV.W,
; EG-NEXT: MULLO_INT * T0.Y, PV.Z, PS,
; EG-NEXT: ADD_INT T2.W, T2.W, PV.W,
; EG-NEXT: MULHI * T0.Y, T0.X, PS,
; EG-NEXT: ADD_INT T4.W, T0.X, PS,
; EG-NEXT: XOR_INT * T2.W, PV.W, T3.W,
; EG-NEXT: MULHI * T0.X, PS, PV.W,
; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W,
; EG-NEXT: SUB_INT * T2.W, T2.W, PS,
; EG-NEXT: ADD_INT T0.Z, T0.X, 1,
; EG-NEXT: SETGE_UINT T4.W, PV.W, T0.W,
; EG-NEXT: SUB_INT * T5.W, PV.W, T0.W,
; EG-NEXT: CNDE_INT T2.W, PV.W, T2.W, PS,
; EG-NEXT: CNDE_INT * T4.W, PV.W, T0.X, PV.Z,
; EG-NEXT: ADD_INT T5.W, PS, 1,
; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W,
; EG-NEXT: CNDE_INT T0.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221
; EG-NEXT: XOR_INT * T1.W, T3.W, T1.W,
; EG-NEXT: XOR_INT * T0.W, PV.W, PS,
; EG-NEXT: SUB_INT * T0.W, PV.W, T1.W,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 7(9.809089e-45), 2(2.802597e-45)
  %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1
  %num = load i25, i25 addrspace(1) * %in
  %den = load i25, i25 
addrspace(1) * %den_ptr
  %result = sdiv i25 %num, %den
  %result.ext = sext i25 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}

; Tests for 64-bit divide bypass.
; NOTE(review): the three kernels below are commented out, so they are neither
; compiled nor checked by this test. The reason is not visible in this file -
; confirm before re-enabling or deleting them.
; define amdgpu_kernel void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; %result = sdiv i64 %a, %b
; store i64 %result, i64 addrspace(1)* %out, align 8
; ret void
; }

; define amdgpu_kernel void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; %result = srem i64 %a, %b
; store i64 %result, i64 addrspace(1)* %out, align 8
; ret void
; }

; define amdgpu_kernel void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; %resultdiv = sdiv i64 %a, %b
; %resultrem = srem i64 %a, %b
; %result = add i64 %resultdiv, %resultrem
; store i64 %result, i64 addrspace(1)* %out, align 8
; ret void
; }

; scalarize_mulhs_4xi32 - sdiv of a <4 x i32> loaded from %in by the splat
; constant 53668, with the quotient vector stored to %out.
; The expected output contains no division sequence: each lane is handled
; with a signed multiply-high by 0x1389c755 (printed as 327796565 in the EG
; output), an arithmetic shift right by 12, and an add of the product's sign
; bit obtained with a logical shift right by 31.
define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
; GCN-LABEL: scalarize_mulhs_4xi32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_mov_b32 s0, 0x1389c755
; GCN-NEXT: s_mov_b32 s4, s2
; GCN-NEXT: s_mov_b32 s5, s3
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_hi_i32 v0, v0, s0
; GCN-NEXT: v_mul_hi_i32 v1, v1, s0
; GCN-NEXT: v_mul_hi_i32 v2, v2, s0
; GCN-NEXT: v_mul_hi_i32 v3, v3, s0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; GCN-NEXT: v_lshrrev_b32_e32 v5, 31, v1
; GCN-NEXT: v_ashrrev_i32_e32 v1, 12, v1
; GCN-NEXT: v_lshrrev_b32_e32 v6, 31, v2
; GCN-NEXT: v_ashrrev_i32_e32 v2, 12, v2
; GCN-NEXT: v_lshrrev_b32_e32 v7, 31, v3
; GCN-NEXT: v_ashrrev_i32_e32 v3, 12, v3
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: scalarize_mulhs_4xi32:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s7, 0xf000
; TONGA-NEXT: s_mov_b32 s6, -1
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s4, s0
; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; TONGA-NEXT: s_mov_b32 s0, 0x1389c755
; TONGA-NEXT: s_mov_b32 s4, s2
; TONGA-NEXT: s_mov_b32 s5, s3
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_mul_hi_i32 v0, v0, s0
; TONGA-NEXT: v_mul_hi_i32 v1, v1, s0
; TONGA-NEXT: v_mul_hi_i32 v2, v2, s0
; TONGA-NEXT: v_mul_hi_i32 v3, v3, s0
; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v5, 31, v1
; TONGA-NEXT: v_ashrrev_i32_e32 v1, 12, v1
; TONGA-NEXT: v_lshrrev_b32_e32 v6, 31, v2
; TONGA-NEXT: v_ashrrev_i32_e32 v2, 12, v2
; TONGA-NEXT: v_lshrrev_b32_e32 v7, 31, v3
; TONGA-NEXT: v_ashrrev_i32_e32 v3, 12, v3
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: scalarize_mulhs_4xi32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_mov_b32 s0, 0x1389c755
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0
; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0
; GFX9-NEXT: v_mul_hi_i32 v2, v2, s0
; GFX9-NEXT: v_mul_hi_i32 v3, v3, s0
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 12, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v2
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 12, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 31, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 12, v3
; GFX9-NEXT: v_add_u32_e32 v0, v0, v4
; GFX9-NEXT: v_add_u32_e32 v1, v1, v5
; GFX9-NEXT: v_add_u32_e32 v2, v2, v6
; GFX9-NEXT: v_add_u32_e32 v3, v3, v7
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: scalarize_mulhs_4xi32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 25, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Y,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MULHI_INT * T0.W, T0.W, literal.x,
; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
; EG-NEXT: ASHR T1.Z, PS, literal.x,
; EG-NEXT: LSHR T0.W, PS, literal.y,
; EG-NEXT: MULHI_INT * T0.Z, T0.Z, literal.z,
; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
; EG-NEXT: ASHR T1.Y, PS, literal.x,
; EG-NEXT: LSHR T0.Z, PS, literal.y,
; EG-NEXT: ADD_INT T0.W, PV.Z, PV.W,
; EG-NEXT: MULHI_INT * T0.Y, T0.Y, literal.z,
; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
; EG-NEXT: ASHR T2.Y, PS, literal.x,
; EG-NEXT: ADD_INT T0.Z, PV.Y, PV.Z,
; EG-NEXT: LSHR T1.W, PS, literal.y,
; EG-NEXT: MULHI_INT * T0.X, T0.X, literal.z,
; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.Y, PV.Y, PV.W,
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: LSHR * T2.W, PS, literal.y,
; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
; EG-NEXT: ADD_INT T0.X, PV.W, PS,
; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
  %2 = sdiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
  store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
  ret void
}