1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI 3; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=VI 4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GCN 5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GFX1030 6; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG 7 8define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { 9; SI-LABEL: udiv_i32: 10; SI: ; %bb.0: 11; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 12; SI-NEXT: s_mov_b32 s7, 0xf000 13; SI-NEXT: s_mov_b32 s6, -1 14; SI-NEXT: s_mov_b32 s10, s6 15; SI-NEXT: s_mov_b32 s11, s7 16; SI-NEXT: s_waitcnt lgkmcnt(0) 17; SI-NEXT: s_mov_b32 s8, s2 18; SI-NEXT: s_mov_b32 s9, s3 19; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 20; SI-NEXT: s_mov_b32 s4, s0 21; SI-NEXT: s_mov_b32 s5, s1 22; SI-NEXT: s_waitcnt vmcnt(0) 23; SI-NEXT: v_cvt_f32_u32_e32 v2, v1 24; SI-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 25; SI-NEXT: v_rcp_iflag_f32_e32 v2, v2 26; SI-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 27; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 28; SI-NEXT: v_mul_lo_u32 v3, v3, v2 29; SI-NEXT: v_mul_hi_u32 v3, v2, v3 30; SI-NEXT: v_add_i32_e32 v2, vcc, v3, v2 31; SI-NEXT: v_mul_hi_u32 v2, v0, v2 32; SI-NEXT: v_mul_lo_u32 v3, v2, v1 33; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v2 34; SI-NEXT: v_subrev_i32_e32 v0, vcc, v3, v0 35; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v1 36; SI-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 37; SI-NEXT: v_subrev_i32_e32 v3, vcc, v1, 
v0 38; SI-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 39; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v2 40; SI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 41; SI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 42; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 43; SI-NEXT: s_endpgm 44; 45; VI-LABEL: udiv_i32: 46; VI: ; %bb.0: 47; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 48; VI-NEXT: s_mov_b32 s7, 0xf000 49; VI-NEXT: s_mov_b32 s6, -1 50; VI-NEXT: s_mov_b32 s10, s6 51; VI-NEXT: s_mov_b32 s11, s7 52; VI-NEXT: s_waitcnt lgkmcnt(0) 53; VI-NEXT: s_mov_b32 s8, s2 54; VI-NEXT: s_mov_b32 s9, s3 55; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 56; VI-NEXT: s_mov_b32 s4, s0 57; VI-NEXT: s_mov_b32 s5, s1 58; VI-NEXT: s_waitcnt vmcnt(0) 59; VI-NEXT: v_cvt_f32_u32_e32 v2, v1 60; VI-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 61; VI-NEXT: v_rcp_iflag_f32_e32 v2, v2 62; VI-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 63; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 64; VI-NEXT: v_mul_lo_u32 v3, v3, v2 65; VI-NEXT: v_mul_hi_u32 v3, v2, v3 66; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v2 67; VI-NEXT: v_mul_hi_u32 v2, v0, v2 68; VI-NEXT: v_mul_lo_u32 v3, v2, v1 69; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v2 70; VI-NEXT: v_subrev_u32_e32 v0, vcc, v3, v0 71; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v1 72; VI-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 73; VI-NEXT: v_subrev_u32_e32 v3, vcc, v1, v0 74; VI-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 75; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v2 76; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 77; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 78; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 79; VI-NEXT: s_endpgm 80; 81; GCN-LABEL: udiv_i32: 82; GCN: ; %bb.0: 83; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 84; GCN-NEXT: s_waitcnt lgkmcnt(0) 85; GCN-NEXT: v_mov_b32_e32 v0, s2 86; GCN-NEXT: v_mov_b32_e32 v1, s3 87; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 88; GCN-NEXT: s_waitcnt vmcnt(0) 89; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 90; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 91; GCN-NEXT: v_rcp_iflag_f32_e32 v2, 
v2 92; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 93; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 94; GCN-NEXT: v_mul_lo_u32 v3, v3, v2 95; GCN-NEXT: v_mul_hi_u32 v3, v2, v3 96; GCN-NEXT: v_add_u32_e32 v2, vcc, v3, v2 97; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 98; GCN-NEXT: v_mov_b32_e32 v2, s0 99; GCN-NEXT: v_mov_b32_e32 v3, s1 100; GCN-NEXT: v_mul_lo_u32 v5, v4, v1 101; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4 102; GCN-NEXT: v_subrev_u32_e32 v0, vcc, v5, v0 103; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v1 104; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] 105; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v1, v0 106; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1] 107; GCN-NEXT: v_add_u32_e32 v5, vcc, 1, v4 108; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 109; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 110; GCN-NEXT: flat_store_dword v[2:3], v0 111; GCN-NEXT: s_endpgm 112; 113; GFX1030-LABEL: udiv_i32: 114; GFX1030: ; %bb.0: 115; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 116; GFX1030-NEXT: v_mov_b32_e32 v2, 0 117; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 118; GFX1030-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 119; GFX1030-NEXT: s_waitcnt vmcnt(0) 120; GFX1030-NEXT: v_cvt_f32_u32_e32 v3, v1 121; GFX1030-NEXT: v_sub_nc_u32_e32 v4, 0, v1 122; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v3 123; GFX1030-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 124; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3 125; GFX1030-NEXT: v_mul_lo_u32 v4, v4, v3 126; GFX1030-NEXT: v_mul_hi_u32 v4, v3, v4 127; GFX1030-NEXT: v_add_nc_u32_e32 v3, v3, v4 128; GFX1030-NEXT: v_mul_hi_u32 v3, v0, v3 129; GFX1030-NEXT: v_mul_lo_u32 v4, v3, v1 130; GFX1030-NEXT: v_sub_nc_u32_e32 v0, v0, v4 131; GFX1030-NEXT: v_add_nc_u32_e32 v4, 1, v3 132; GFX1030-NEXT: v_sub_nc_u32_e32 v5, v0, v1 133; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v1 134; GFX1030-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo 135; GFX1030-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo 136; GFX1030-NEXT: v_add_nc_u32_e32 v4, 1, v3 137; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v1 138; 
GFX1030-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo 139; GFX1030-NEXT: global_store_dword v2, v0, s[0:1] 140; GFX1030-NEXT: s_endpgm 141; 142; EG-LABEL: udiv_i32: 143; EG: ; %bb.0: 144; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 145; EG-NEXT: TEX 0 @6 146; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[] 147; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 148; EG-NEXT: CF_END 149; EG-NEXT: PAD 150; EG-NEXT: Fetch clause starting at 6: 151; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 152; EG-NEXT: ALU clause starting at 8: 153; EG-NEXT: MOV * T0.X, KC0[2].Z, 154; EG-NEXT: ALU clause starting at 9: 155; EG-NEXT: SUB_INT T0.W, 0.0, T0.Y, 156; EG-NEXT: RECIP_UINT * T0.Z, T0.Y, 157; EG-NEXT: MULLO_INT * T0.W, PV.W, PS, 158; EG-NEXT: MULHI * T0.W, T0.Z, PS, 159; EG-NEXT: ADD_INT * T0.W, T0.Z, PS, 160; EG-NEXT: MULHI * T0.Z, T0.X, PV.W, 161; EG-NEXT: MULLO_INT * T0.W, PS, T0.Y, 162; EG-NEXT: SUB_INT * T0.W, T0.X, PS, 163; EG-NEXT: ADD_INT T1.Z, T0.Z, 1, 164; EG-NEXT: SETGE_UINT T1.W, PV.W, T0.Y, 165; EG-NEXT: SUB_INT * T2.W, PV.W, T0.Y, 166; EG-NEXT: CNDE_INT T0.W, PV.W, T0.W, PS, 167; EG-NEXT: CNDE_INT * T1.W, PV.W, T0.Z, PV.Z, 168; EG-NEXT: ADD_INT T2.W, PS, 1, 169; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.Y, 170; EG-NEXT: CNDE_INT T0.X, PS, T1.W, PV.W, 171; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 172; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 173 %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 174 %a = load i32, i32 addrspace(1)* %in 175 %b = load i32, i32 addrspace(1)* %b_ptr 176 %result = udiv i32 %a, %b 177 store i32 %result, i32 addrspace(1)* %out 178 ret void 179} 180 181define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { 182; SI-LABEL: s_udiv_i32: 183; SI: ; %bb.0: 184; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 185; SI-NEXT: s_mov_b32 s7, 0xf000 186; SI-NEXT: s_mov_b32 s6, -1 187; SI-NEXT: s_waitcnt lgkmcnt(0) 188; SI-NEXT: v_cvt_f32_u32_e32 v0, s3 189; SI-NEXT: s_sub_i32 s4, 0, s3 190; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 191; SI-NEXT: 
v_mul_f32_e32 v0, 0x4f7ffffe, v0 192; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 193; SI-NEXT: v_mul_lo_u32 v1, s4, v0 194; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 195; SI-NEXT: v_mul_hi_u32 v1, v0, v1 196; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 197; SI-NEXT: v_mul_hi_u32 v0, s2, v0 198; SI-NEXT: v_mul_lo_u32 v1, v0, s3 199; SI-NEXT: v_add_i32_e32 v2, vcc, 1, v0 200; SI-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 201; SI-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 202; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 203; SI-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 204; SI-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 205; SI-NEXT: v_add_i32_e32 v2, vcc, 1, v0 206; SI-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 207; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 208; SI-NEXT: s_waitcnt lgkmcnt(0) 209; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 210; SI-NEXT: s_endpgm 211; 212; VI-LABEL: s_udiv_i32: 213; VI: ; %bb.0: 214; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 215; VI-NEXT: s_mov_b32 s7, 0xf000 216; VI-NEXT: s_mov_b32 s6, -1 217; VI-NEXT: s_waitcnt lgkmcnt(0) 218; VI-NEXT: v_cvt_f32_u32_e32 v0, s3 219; VI-NEXT: s_sub_i32 s4, 0, s3 220; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 221; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 222; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 223; VI-NEXT: v_mul_lo_u32 v1, s4, v0 224; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 225; VI-NEXT: v_mul_hi_u32 v1, v0, v1 226; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 227; VI-NEXT: v_mul_hi_u32 v0, s2, v0 228; VI-NEXT: v_mul_lo_u32 v1, v0, s3 229; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 230; VI-NEXT: v_sub_u32_e32 v1, vcc, s2, v1 231; VI-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 232; VI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 233; VI-NEXT: v_subrev_u32_e32 v2, vcc, s3, v1 234; VI-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 235; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 236; VI-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 237; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 238; VI-NEXT: s_waitcnt lgkmcnt(0) 239; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 240; 
VI-NEXT: s_endpgm 241; 242; GCN-LABEL: s_udiv_i32: 243; GCN: ; %bb.0: 244; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 245; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 246; GCN-NEXT: s_waitcnt lgkmcnt(0) 247; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 248; GCN-NEXT: s_sub_i32 s0, 0, s3 249; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 250; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 251; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 252; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 253; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 254; GCN-NEXT: v_add_u32_e32 v0, vcc, v1, v0 255; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 256; GCN-NEXT: v_mul_lo_u32 v1, v0, s3 257; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v0 258; GCN-NEXT: v_sub_u32_e32 v1, vcc, s2, v1 259; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 260; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 261; GCN-NEXT: v_subrev_u32_e32 v2, vcc, s3, v1 262; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 263; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v0 264; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 265; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc 266; GCN-NEXT: v_mov_b32_e32 v0, s4 267; GCN-NEXT: v_mov_b32_e32 v1, s5 268; GCN-NEXT: flat_store_dword v[0:1], v2 269; GCN-NEXT: s_endpgm 270; 271; GFX1030-LABEL: s_udiv_i32: 272; GFX1030: ; %bb.0: 273; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 274; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 275; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s1 276; GFX1030-NEXT: s_sub_i32 s2, 0, s1 277; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0 278; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 279; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 280; GFX1030-NEXT: v_mul_lo_u32 v1, s2, v0 281; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 282; GFX1030-NEXT: v_mul_hi_u32 v1, v0, v1 283; GFX1030-NEXT: v_add_nc_u32_e32 v0, v0, v1 284; GFX1030-NEXT: v_mul_hi_u32 v0, s0, v0 285; GFX1030-NEXT: v_mul_lo_u32 v1, v0, s1 286; GFX1030-NEXT: v_add_nc_u32_e32 v2, 1, v0 287; GFX1030-NEXT: v_sub_nc_u32_e32 v1, s0, v1 288; GFX1030-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 289; GFX1030-NEXT: 
v_cmp_le_u32_e32 vcc_lo, s1, v1 290; GFX1030-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo 291; GFX1030-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo 292; GFX1030-NEXT: v_mov_b32_e32 v3, 0 293; GFX1030-NEXT: v_add_nc_u32_e32 v2, 1, v0 294; GFX1030-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 295; GFX1030-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo 296; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 297; GFX1030-NEXT: global_store_dword v3, v0, s[2:3] 298; GFX1030-NEXT: s_endpgm 299; 300; EG-LABEL: s_udiv_i32: 301; EG: ; %bb.0: 302; EG-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[] 303; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 304; EG-NEXT: CF_END 305; EG-NEXT: PAD 306; EG-NEXT: ALU clause starting at 4: 307; EG-NEXT: SUB_INT T0.W, 0.0, KC0[2].W, 308; EG-NEXT: RECIP_UINT * T0.X, KC0[2].W, 309; EG-NEXT: MULLO_INT * T0.Y, PV.W, PS, 310; EG-NEXT: MULHI * T0.Y, T0.X, PS, 311; EG-NEXT: ADD_INT * T0.W, T0.X, PS, 312; EG-NEXT: MULHI * T0.X, KC0[2].Z, PV.W, 313; EG-NEXT: MULLO_INT * T0.Y, PS, KC0[2].W, 314; EG-NEXT: SUB_INT * T0.W, KC0[2].Z, PS, 315; EG-NEXT: SUB_INT T0.Z, PV.W, KC0[2].W, 316; EG-NEXT: SETGE_UINT T1.W, PV.W, KC0[2].W, 317; EG-NEXT: ADD_INT * T2.W, T0.X, 1, 318; EG-NEXT: CNDE_INT T2.W, PV.W, T0.X, PS, 319; EG-NEXT: CNDE_INT * T0.W, PV.W, T0.W, PV.Z, 320; EG-NEXT: SETGE_UINT T0.W, PS, KC0[2].W, 321; EG-NEXT: ADD_INT * T1.W, PV.W, 1, 322; EG-NEXT: CNDE_INT T0.X, PV.W, T2.W, PS, 323; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 324; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 325 %result = udiv i32 %a, %b 326 store i32 %result, i32 addrspace(1)* %out 327 ret void 328} 329 330 331; The code generated by udiv is long and complex and may frequently 332; change. 
The goal of this test is to make sure the ISel doesn't fail 333; when it gets a vector udiv (v2i32 or v4i32) 334define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { 335; SI-LABEL: udiv_v2i32: 336; SI: ; %bb.0: 337; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 338; SI-NEXT: s_mov_b32 s7, 0xf000 339; SI-NEXT: s_mov_b32 s6, -1 340; SI-NEXT: s_mov_b32 s10, s6 341; SI-NEXT: s_mov_b32 s11, s7 342; SI-NEXT: s_waitcnt lgkmcnt(0) 343; SI-NEXT: s_mov_b32 s8, s2 344; SI-NEXT: s_mov_b32 s9, s3 345; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 346; SI-NEXT: s_mov_b32 s2, 0x4f7ffffe 347; SI-NEXT: s_mov_b32 s4, s0 348; SI-NEXT: s_mov_b32 s5, s1 349; SI-NEXT: s_waitcnt vmcnt(0) 350; SI-NEXT: v_cvt_f32_u32_e32 v4, v2 351; SI-NEXT: v_cvt_f32_u32_e32 v5, v3 352; SI-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 353; SI-NEXT: v_rcp_iflag_f32_e32 v4, v4 354; SI-NEXT: v_rcp_iflag_f32_e32 v5, v5 355; SI-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 356; SI-NEXT: v_mul_f32_e32 v4, s2, v4 357; SI-NEXT: v_mul_f32_e32 v5, s2, v5 358; SI-NEXT: v_cvt_u32_f32_e32 v4, v4 359; SI-NEXT: v_cvt_u32_f32_e32 v5, v5 360; SI-NEXT: v_mul_lo_u32 v6, v6, v4 361; SI-NEXT: v_mul_lo_u32 v7, v7, v5 362; SI-NEXT: v_mul_hi_u32 v6, v4, v6 363; SI-NEXT: v_mul_hi_u32 v7, v5, v7 364; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4 365; SI-NEXT: v_add_i32_e32 v5, vcc, v7, v5 366; SI-NEXT: v_mul_hi_u32 v4, v0, v4 367; SI-NEXT: v_mul_hi_u32 v5, v1, v5 368; SI-NEXT: v_mul_lo_u32 v6, v4, v2 369; SI-NEXT: v_mul_lo_u32 v8, v5, v3 370; SI-NEXT: v_add_i32_e32 v7, vcc, 1, v4 371; SI-NEXT: v_subrev_i32_e32 v0, vcc, v6, v0 372; SI-NEXT: v_subrev_i32_e32 v1, vcc, v8, v1 373; SI-NEXT: v_add_i32_e32 v9, vcc, 1, v5 374; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 375; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 376; SI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] 377; SI-NEXT: v_subrev_i32_e32 v6, vcc, v2, v0 378; SI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3] 379; SI-NEXT: v_subrev_i32_e32 v7, vcc, v3, v1 380; SI-NEXT: v_cndmask_b32_e64 
v0, v0, v6, s[0:1] 381; SI-NEXT: v_add_i32_e32 v6, vcc, 1, v4 382; SI-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] 383; SI-NEXT: v_add_i32_e32 v7, vcc, 1, v5 384; SI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 385; SI-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc 386; SI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 387; SI-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc 388; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 389; SI-NEXT: s_endpgm 390; 391; VI-LABEL: udiv_v2i32: 392; VI: ; %bb.0: 393; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 394; VI-NEXT: s_mov_b32 s7, 0xf000 395; VI-NEXT: s_mov_b32 s6, -1 396; VI-NEXT: s_mov_b32 s10, s6 397; VI-NEXT: s_mov_b32 s11, s7 398; VI-NEXT: s_waitcnt lgkmcnt(0) 399; VI-NEXT: s_mov_b32 s8, s2 400; VI-NEXT: s_mov_b32 s9, s3 401; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 402; VI-NEXT: s_mov_b32 s2, 0x4f7ffffe 403; VI-NEXT: s_mov_b32 s4, s0 404; VI-NEXT: s_mov_b32 s5, s1 405; VI-NEXT: s_waitcnt vmcnt(0) 406; VI-NEXT: v_cvt_f32_u32_e32 v4, v2 407; VI-NEXT: v_cvt_f32_u32_e32 v5, v3 408; VI-NEXT: v_sub_u32_e32 v6, vcc, 0, v2 409; VI-NEXT: v_rcp_iflag_f32_e32 v4, v4 410; VI-NEXT: v_rcp_iflag_f32_e32 v5, v5 411; VI-NEXT: v_sub_u32_e32 v7, vcc, 0, v3 412; VI-NEXT: v_mul_f32_e32 v4, s2, v4 413; VI-NEXT: v_mul_f32_e32 v5, s2, v5 414; VI-NEXT: v_cvt_u32_f32_e32 v4, v4 415; VI-NEXT: v_cvt_u32_f32_e32 v5, v5 416; VI-NEXT: v_mul_lo_u32 v6, v6, v4 417; VI-NEXT: v_mul_lo_u32 v7, v7, v5 418; VI-NEXT: v_mul_hi_u32 v6, v4, v6 419; VI-NEXT: v_mul_hi_u32 v7, v5, v7 420; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4 421; VI-NEXT: v_add_u32_e32 v5, vcc, v7, v5 422; VI-NEXT: v_mul_hi_u32 v4, v0, v4 423; VI-NEXT: v_mul_hi_u32 v5, v1, v5 424; VI-NEXT: v_mul_lo_u32 v6, v4, v2 425; VI-NEXT: v_mul_lo_u32 v8, v5, v3 426; VI-NEXT: v_add_u32_e32 v7, vcc, 1, v4 427; VI-NEXT: v_subrev_u32_e32 v0, vcc, v6, v0 428; VI-NEXT: v_subrev_u32_e32 v1, vcc, v8, v1 429; VI-NEXT: v_add_u32_e32 v9, vcc, 1, v5 430; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 431; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], 
v1, v3 432; VI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] 433; VI-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0 434; VI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3] 435; VI-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1 436; VI-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] 437; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v4 438; VI-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] 439; VI-NEXT: v_add_u32_e32 v7, vcc, 1, v5 440; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 441; VI-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc 442; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 443; VI-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc 444; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 445; VI-NEXT: s_endpgm 446; 447; GCN-LABEL: udiv_v2i32: 448; GCN: ; %bb.0: 449; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 450; GCN-NEXT: s_waitcnt lgkmcnt(0) 451; GCN-NEXT: v_mov_b32_e32 v0, s2 452; GCN-NEXT: v_mov_b32_e32 v1, s3 453; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 454; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe 455; GCN-NEXT: s_waitcnt vmcnt(0) 456; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 457; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 458; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 459; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5 460; GCN-NEXT: v_mul_f32_e32 v4, s2, v4 461; GCN-NEXT: v_cvt_u32_f32_e32 v6, v4 462; GCN-NEXT: v_mul_f32_e32 v5, s2, v5 463; GCN-NEXT: v_cvt_u32_f32_e32 v7, v5 464; GCN-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 465; GCN-NEXT: v_mul_lo_u32 v5, v4, v6 466; GCN-NEXT: v_sub_u32_e32 v4, vcc, 0, v3 467; GCN-NEXT: v_mul_lo_u32 v8, v4, v7 468; GCN-NEXT: v_mul_hi_u32 v9, v6, v5 469; GCN-NEXT: v_mov_b32_e32 v4, s0 470; GCN-NEXT: v_mov_b32_e32 v5, s1 471; GCN-NEXT: v_mul_hi_u32 v8, v7, v8 472; GCN-NEXT: v_add_u32_e32 v6, vcc, v9, v6 473; GCN-NEXT: v_mul_hi_u32 v6, v0, v6 474; GCN-NEXT: v_add_u32_e32 v7, vcc, v8, v7 475; GCN-NEXT: v_mul_hi_u32 v7, v1, v7 476; GCN-NEXT: v_mul_lo_u32 v8, v6, v2 477; GCN-NEXT: v_add_u32_e32 v9, vcc, 1, v6 478; GCN-NEXT: v_mul_lo_u32 v10, v7, v3 479; GCN-NEXT: v_subrev_u32_e32 v0, vcc, v8, v0 480; GCN-NEXT: v_add_u32_e32 v11, 
vcc, 1, v7 481; GCN-NEXT: v_subrev_u32_e32 v1, vcc, v10, v1 482; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 483; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 484; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] 485; GCN-NEXT: v_subrev_u32_e32 v8, vcc, v2, v0 486; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[2:3] 487; GCN-NEXT: v_subrev_u32_e32 v9, vcc, v3, v1 488; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] 489; GCN-NEXT: v_add_u32_e32 v8, vcc, 1, v6 490; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3] 491; GCN-NEXT: v_add_u32_e32 v9, vcc, 1, v7 492; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 493; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc 494; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 495; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc 496; GCN-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 497; GCN-NEXT: s_endpgm 498; 499; GFX1030-LABEL: udiv_v2i32: 500; GFX1030: ; %bb.0: 501; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 502; GFX1030-NEXT: v_mov_b32_e32 v4, 0 503; GFX1030-NEXT: s_mov_b32 s0, 0x4f7ffffe 504; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 505; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] 506; GFX1030-NEXT: s_waitcnt vmcnt(0) 507; GFX1030-NEXT: v_cvt_f32_u32_e32 v5, v2 508; GFX1030-NEXT: v_cvt_f32_u32_e32 v6, v3 509; GFX1030-NEXT: v_sub_nc_u32_e32 v7, 0, v2 510; GFX1030-NEXT: v_sub_nc_u32_e32 v8, 0, v3 511; GFX1030-NEXT: v_rcp_iflag_f32_e32 v5, v5 512; GFX1030-NEXT: v_rcp_iflag_f32_e32 v6, v6 513; GFX1030-NEXT: v_mul_f32_e32 v5, s0, v5 514; GFX1030-NEXT: v_mul_f32_e32 v6, s0, v6 515; GFX1030-NEXT: v_cvt_u32_f32_e32 v5, v5 516; GFX1030-NEXT: v_cvt_u32_f32_e32 v6, v6 517; GFX1030-NEXT: v_mul_lo_u32 v7, v7, v5 518; GFX1030-NEXT: v_mul_lo_u32 v8, v8, v6 519; GFX1030-NEXT: v_mul_hi_u32 v7, v5, v7 520; GFX1030-NEXT: v_mul_hi_u32 v8, v6, v8 521; GFX1030-NEXT: v_add_nc_u32_e32 v5, v5, v7 522; GFX1030-NEXT: v_add_nc_u32_e32 v6, v6, v8 523; GFX1030-NEXT: v_mul_hi_u32 v5, v0, v5 524; GFX1030-NEXT: v_mul_hi_u32 v6, v1, v6 525; GFX1030-NEXT: v_mul_lo_u32 v7, v5, v2 526; 
GFX1030-NEXT: v_mul_lo_u32 v8, v6, v3 527; GFX1030-NEXT: v_sub_nc_u32_e32 v0, v0, v7 528; GFX1030-NEXT: v_add_nc_u32_e32 v7, 1, v5 529; GFX1030-NEXT: v_sub_nc_u32_e32 v1, v1, v8 530; GFX1030-NEXT: v_add_nc_u32_e32 v8, 1, v6 531; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v2 532; GFX1030-NEXT: v_sub_nc_u32_e32 v9, v1, v3 533; GFX1030-NEXT: v_cmp_ge_u32_e64 s0, v1, v3 534; GFX1030-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo 535; GFX1030-NEXT: v_sub_nc_u32_e32 v7, v0, v2 536; GFX1030-NEXT: v_cndmask_b32_e64 v6, v6, v8, s0 537; GFX1030-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 538; GFX1030-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo 539; GFX1030-NEXT: v_add_nc_u32_e32 v7, 1, v5 540; GFX1030-NEXT: v_add_nc_u32_e32 v8, 1, v6 541; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v2 542; GFX1030-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc_lo 543; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v3 544; GFX1030-NEXT: v_cndmask_b32_e32 v1, v6, v8, vcc_lo 545; GFX1030-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 546; GFX1030-NEXT: s_endpgm 547; 548; EG-LABEL: udiv_v2i32: 549; EG: ; %bb.0: 550; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 551; EG-NEXT: TEX 1 @6 552; EG-NEXT: ALU 33, @11, KC0[CB0:0-32], KC1[] 553; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1 554; EG-NEXT: CF_END 555; EG-NEXT: PAD 556; EG-NEXT: Fetch clause starting at 6: 557; EG-NEXT: VTX_READ_64 T1.XY, T0.X, 8, #1 558; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 559; EG-NEXT: ALU clause starting at 10: 560; EG-NEXT: MOV * T0.X, KC0[2].Z, 561; EG-NEXT: ALU clause starting at 11: 562; EG-NEXT: SUB_INT T0.W, 0.0, T1.Y, 563; EG-NEXT: RECIP_UINT * T0.Z, T1.Y, 564; EG-NEXT: MULLO_INT * T0.W, PV.W, PS, 565; EG-NEXT: SUB_INT T1.W, 0.0, T1.X, 566; EG-NEXT: RECIP_UINT * T1.Z, T1.X, 567; EG-NEXT: MULLO_INT * T1.W, PV.W, PS, 568; EG-NEXT: MULHI * T1.W, T1.Z, PS, 569; EG-NEXT: ADD_INT T1.W, T1.Z, PS, 570; EG-NEXT: MULHI * T0.W, T0.Z, T0.W, 571; EG-NEXT: ADD_INT T0.W, T0.Z, PS, 572; EG-NEXT: MULHI * T0.Z, T0.X, PV.W, 573; EG-NEXT: MULHI * 
T0.W, T0.Y, PV.W, 574; EG-NEXT: MULLO_INT * T1.Z, PS, T1.Y, 575; EG-NEXT: SUB_INT T1.W, T0.Y, PS, 576; EG-NEXT: MULLO_INT * T0.Y, T0.Z, T1.X, 577; EG-NEXT: SUB_INT T0.Y, T0.X, PS, 578; EG-NEXT: ADD_INT T1.Z, T0.W, 1, 579; EG-NEXT: SETGE_UINT T2.W, PV.W, T1.Y, 580; EG-NEXT: SUB_INT * T3.W, PV.W, T1.Y, 581; EG-NEXT: CNDE_INT T0.X, PV.W, T1.W, PS, 582; EG-NEXT: CNDE_INT T2.Y, PV.W, T0.W, PV.Z, BS:VEC_021/SCL_122 583; EG-NEXT: ADD_INT T1.Z, T0.Z, 1, 584; EG-NEXT: SETGE_UINT T0.W, PV.Y, T1.X, 585; EG-NEXT: SUB_INT * T1.W, PV.Y, T1.X, 586; EG-NEXT: CNDE_INT T0.Y, PV.W, T0.Y, PS, BS:VEC_021/SCL_122 587; EG-NEXT: CNDE_INT T0.Z, PV.W, T0.Z, PV.Z, 588; EG-NEXT: ADD_INT T0.W, PV.Y, 1, 589; EG-NEXT: SETGE_UINT * T1.W, PV.X, T1.Y, 590; EG-NEXT: CNDE_INT T1.Y, PS, T2.Y, PV.W, 591; EG-NEXT: ADD_INT T0.W, PV.Z, 1, 592; EG-NEXT: SETGE_UINT * T1.W, PV.Y, T1.X, 593; EG-NEXT: CNDE_INT T1.X, PS, T0.Z, PV.W, 594; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 595; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 596 %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 597 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in 598 %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr 599 %result = udiv <2 x i32> %a, %b 600 store <2 x i32> %result, <2 x i32> addrspace(1)* %out 601 ret void 602} 603 604define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { 605; SI-LABEL: udiv_v4i32: 606; SI: ; %bb.0: 607; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 608; SI-NEXT: s_mov_b32 s11, 0xf000 609; SI-NEXT: s_mov_b32 s10, -1 610; SI-NEXT: s_mov_b32 s6, s10 611; SI-NEXT: s_mov_b32 s7, s11 612; SI-NEXT: s_waitcnt lgkmcnt(0) 613; SI-NEXT: s_mov_b32 s4, s2 614; SI-NEXT: s_mov_b32 s5, s3 615; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 616; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 617; SI-NEXT: s_mov_b32 s2, 0x4f7ffffe 618; SI-NEXT: s_mov_b32 s8, s0 619; SI-NEXT: s_mov_b32 s9, s1 620; SI-NEXT: s_waitcnt vmcnt(1) 621; SI-NEXT: 
v_cvt_f32_u32_e32 v8, v0 622; SI-NEXT: v_cvt_f32_u32_e32 v10, v1 623; SI-NEXT: v_cvt_f32_u32_e32 v12, v2 624; SI-NEXT: v_cvt_f32_u32_e32 v14, v3 625; SI-NEXT: v_rcp_iflag_f32_e32 v8, v8 626; SI-NEXT: v_rcp_iflag_f32_e32 v10, v10 627; SI-NEXT: v_rcp_iflag_f32_e32 v12, v12 628; SI-NEXT: v_rcp_iflag_f32_e32 v14, v14 629; SI-NEXT: v_mul_f32_e32 v8, s2, v8 630; SI-NEXT: v_mul_f32_e32 v10, s2, v10 631; SI-NEXT: v_mul_f32_e32 v12, s2, v12 632; SI-NEXT: v_mul_f32_e32 v14, s2, v14 633; SI-NEXT: v_cvt_u32_f32_e32 v8, v8 634; SI-NEXT: v_cvt_u32_f32_e32 v10, v10 635; SI-NEXT: v_cvt_u32_f32_e32 v12, v12 636; SI-NEXT: v_cvt_u32_f32_e32 v14, v14 637; SI-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 638; SI-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 639; SI-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 640; SI-NEXT: v_sub_i32_e32 v15, vcc, 0, v3 641; SI-NEXT: v_mul_lo_u32 v9, v9, v8 642; SI-NEXT: v_mul_lo_u32 v11, v11, v10 643; SI-NEXT: v_mul_lo_u32 v13, v13, v12 644; SI-NEXT: v_mul_lo_u32 v15, v15, v14 645; SI-NEXT: v_mul_hi_u32 v9, v8, v9 646; SI-NEXT: v_mul_hi_u32 v11, v10, v11 647; SI-NEXT: v_mul_hi_u32 v13, v12, v13 648; SI-NEXT: v_mul_hi_u32 v15, v14, v15 649; SI-NEXT: v_add_i32_e32 v8, vcc, v9, v8 650; SI-NEXT: v_add_i32_e32 v9, vcc, v11, v10 651; SI-NEXT: v_add_i32_e32 v10, vcc, v13, v12 652; SI-NEXT: v_add_i32_e32 v11, vcc, v15, v14 653; SI-NEXT: s_waitcnt vmcnt(0) 654; SI-NEXT: v_mul_hi_u32 v8, v4, v8 655; SI-NEXT: v_mul_hi_u32 v9, v5, v9 656; SI-NEXT: v_mul_hi_u32 v10, v6, v10 657; SI-NEXT: v_mul_hi_u32 v11, v7, v11 658; SI-NEXT: v_mul_lo_u32 v12, v8, v0 659; SI-NEXT: v_mul_lo_u32 v14, v9, v1 660; SI-NEXT: v_mul_lo_u32 v16, v10, v2 661; SI-NEXT: v_mul_lo_u32 v18, v11, v3 662; SI-NEXT: v_subrev_i32_e32 v4, vcc, v12, v4 663; SI-NEXT: v_subrev_i32_e32 v5, vcc, v14, v5 664; SI-NEXT: v_subrev_i32_e32 v6, vcc, v16, v6 665; SI-NEXT: v_subrev_i32_e32 v7, vcc, v18, v7 666; SI-NEXT: v_add_i32_e32 v13, vcc, 1, v8 667; SI-NEXT: v_add_i32_e32 v15, vcc, 1, v9 668; SI-NEXT: v_add_i32_e32 v17, vcc, 1, v10 669; 
SI-NEXT: v_add_i32_e32 v19, vcc, 1, v11 670; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0 671; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 672; SI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 673; SI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 674; SI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] 675; SI-NEXT: v_subrev_i32_e32 v12, vcc, v0, v4 676; SI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] 677; SI-NEXT: v_subrev_i32_e32 v13, vcc, v1, v5 678; SI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] 679; SI-NEXT: v_subrev_i32_e32 v14, vcc, v2, v6 680; SI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] 681; SI-NEXT: v_subrev_i32_e32 v15, vcc, v3, v7 682; SI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] 683; SI-NEXT: v_add_i32_e32 v12, vcc, 1, v8 684; SI-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[2:3] 685; SI-NEXT: v_add_i32_e32 v13, vcc, 1, v9 686; SI-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] 687; SI-NEXT: v_add_i32_e32 v14, vcc, 1, v10 688; SI-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[6:7] 689; SI-NEXT: v_add_i32_e32 v15, vcc, 1, v11 690; SI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 691; SI-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc 692; SI-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 693; SI-NEXT: v_cndmask_b32_e32 v1, v9, v13, vcc 694; SI-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 695; SI-NEXT: v_cndmask_b32_e32 v2, v10, v14, vcc 696; SI-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 697; SI-NEXT: v_cndmask_b32_e32 v3, v11, v15, vcc 698; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 699; SI-NEXT: s_endpgm 700; 701; VI-LABEL: udiv_v4i32: 702; VI: ; %bb.0: 703; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 704; VI-NEXT: s_mov_b32 s11, 0xf000 705; VI-NEXT: s_mov_b32 s10, -1 706; VI-NEXT: s_mov_b32 s6, s10 707; VI-NEXT: s_mov_b32 s7, s11 708; VI-NEXT: s_waitcnt lgkmcnt(0) 709; VI-NEXT: s_mov_b32 s4, s2 710; VI-NEXT: s_mov_b32 s5, s3 711; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 712; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 713; VI-NEXT: s_mov_b32 s2, 0x4f7ffffe 714; VI-NEXT: s_mov_b32 s8, s0 715; 
VI-NEXT: s_mov_b32 s9, s1 716; VI-NEXT: s_waitcnt vmcnt(1) 717; VI-NEXT: v_cvt_f32_u32_e32 v8, v0 718; VI-NEXT: v_cvt_f32_u32_e32 v10, v1 719; VI-NEXT: v_cvt_f32_u32_e32 v12, v2 720; VI-NEXT: v_cvt_f32_u32_e32 v14, v3 721; VI-NEXT: v_rcp_iflag_f32_e32 v8, v8 722; VI-NEXT: v_rcp_iflag_f32_e32 v10, v10 723; VI-NEXT: v_rcp_iflag_f32_e32 v12, v12 724; VI-NEXT: v_rcp_iflag_f32_e32 v14, v14 725; VI-NEXT: v_mul_f32_e32 v8, s2, v8 726; VI-NEXT: v_mul_f32_e32 v10, s2, v10 727; VI-NEXT: v_mul_f32_e32 v12, s2, v12 728; VI-NEXT: v_mul_f32_e32 v14, s2, v14 729; VI-NEXT: v_cvt_u32_f32_e32 v8, v8 730; VI-NEXT: v_cvt_u32_f32_e32 v10, v10 731; VI-NEXT: v_cvt_u32_f32_e32 v12, v12 732; VI-NEXT: v_cvt_u32_f32_e32 v14, v14 733; VI-NEXT: v_sub_u32_e32 v9, vcc, 0, v0 734; VI-NEXT: v_sub_u32_e32 v11, vcc, 0, v1 735; VI-NEXT: v_sub_u32_e32 v13, vcc, 0, v2 736; VI-NEXT: v_sub_u32_e32 v15, vcc, 0, v3 737; VI-NEXT: v_mul_lo_u32 v9, v9, v8 738; VI-NEXT: v_mul_lo_u32 v11, v11, v10 739; VI-NEXT: v_mul_lo_u32 v13, v13, v12 740; VI-NEXT: v_mul_lo_u32 v15, v15, v14 741; VI-NEXT: v_mul_hi_u32 v9, v8, v9 742; VI-NEXT: v_mul_hi_u32 v11, v10, v11 743; VI-NEXT: v_mul_hi_u32 v13, v12, v13 744; VI-NEXT: v_mul_hi_u32 v15, v14, v15 745; VI-NEXT: v_add_u32_e32 v8, vcc, v9, v8 746; VI-NEXT: v_add_u32_e32 v9, vcc, v11, v10 747; VI-NEXT: v_add_u32_e32 v10, vcc, v13, v12 748; VI-NEXT: v_add_u32_e32 v11, vcc, v15, v14 749; VI-NEXT: s_waitcnt vmcnt(0) 750; VI-NEXT: v_mul_hi_u32 v8, v4, v8 751; VI-NEXT: v_mul_hi_u32 v9, v5, v9 752; VI-NEXT: v_mul_hi_u32 v10, v6, v10 753; VI-NEXT: v_mul_hi_u32 v11, v7, v11 754; VI-NEXT: v_mul_lo_u32 v12, v8, v0 755; VI-NEXT: v_mul_lo_u32 v14, v9, v1 756; VI-NEXT: v_mul_lo_u32 v16, v10, v2 757; VI-NEXT: v_mul_lo_u32 v18, v11, v3 758; VI-NEXT: v_subrev_u32_e32 v4, vcc, v12, v4 759; VI-NEXT: v_subrev_u32_e32 v5, vcc, v14, v5 760; VI-NEXT: v_subrev_u32_e32 v6, vcc, v16, v6 761; VI-NEXT: v_subrev_u32_e32 v7, vcc, v18, v7 762; VI-NEXT: v_add_u32_e32 v13, vcc, 1, v8 763; VI-NEXT: 
v_add_u32_e32 v15, vcc, 1, v9 764; VI-NEXT: v_add_u32_e32 v17, vcc, 1, v10 765; VI-NEXT: v_add_u32_e32 v19, vcc, 1, v11 766; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0 767; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 768; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 769; VI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 770; VI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] 771; VI-NEXT: v_subrev_u32_e32 v12, vcc, v0, v4 772; VI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] 773; VI-NEXT: v_subrev_u32_e32 v13, vcc, v1, v5 774; VI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] 775; VI-NEXT: v_subrev_u32_e32 v14, vcc, v2, v6 776; VI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] 777; VI-NEXT: v_subrev_u32_e32 v15, vcc, v3, v7 778; VI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] 779; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v8 780; VI-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[2:3] 781; VI-NEXT: v_add_u32_e32 v13, vcc, 1, v9 782; VI-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] 783; VI-NEXT: v_add_u32_e32 v14, vcc, 1, v10 784; VI-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[6:7] 785; VI-NEXT: v_add_u32_e32 v15, vcc, 1, v11 786; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 787; VI-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc 788; VI-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 789; VI-NEXT: v_cndmask_b32_e32 v1, v9, v13, vcc 790; VI-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 791; VI-NEXT: v_cndmask_b32_e32 v2, v10, v14, vcc 792; VI-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 793; VI-NEXT: v_cndmask_b32_e32 v3, v11, v15, vcc 794; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 795; VI-NEXT: s_endpgm 796; 797; GCN-LABEL: udiv_v4i32: 798; GCN: ; %bb.0: 799; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 800; GCN-NEXT: s_waitcnt lgkmcnt(0) 801; GCN-NEXT: s_add_u32 s4, s2, 16 802; GCN-NEXT: s_addc_u32 s5, s3, 0 803; GCN-NEXT: v_mov_b32_e32 v0, s4 804; GCN-NEXT: v_mov_b32_e32 v1, s5 805; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 806; GCN-NEXT: v_mov_b32_e32 v5, s3 807; GCN-NEXT: v_mov_b32_e32 v4, s2 808; GCN-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 
809; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe 810; GCN-NEXT: v_mov_b32_e32 v8, s0 811; GCN-NEXT: v_mov_b32_e32 v9, s1 812; GCN-NEXT: s_waitcnt vmcnt(1) 813; GCN-NEXT: v_cvt_f32_u32_e32 v10, v0 814; GCN-NEXT: v_cvt_f32_u32_e32 v12, v1 815; GCN-NEXT: v_cvt_f32_u32_e32 v14, v2 816; GCN-NEXT: v_cvt_f32_u32_e32 v16, v3 817; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 818; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 819; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14 820; GCN-NEXT: v_rcp_iflag_f32_e32 v16, v16 821; GCN-NEXT: v_mul_f32_e32 v10, s2, v10 822; GCN-NEXT: v_mul_f32_e32 v12, s2, v12 823; GCN-NEXT: v_mul_f32_e32 v14, s2, v14 824; GCN-NEXT: v_mul_f32_e32 v16, s2, v16 825; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 826; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 827; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 828; GCN-NEXT: v_cvt_u32_f32_e32 v16, v16 829; GCN-NEXT: v_sub_u32_e32 v11, vcc, 0, v0 830; GCN-NEXT: v_sub_u32_e32 v13, vcc, 0, v1 831; GCN-NEXT: v_sub_u32_e32 v15, vcc, 0, v2 832; GCN-NEXT: v_sub_u32_e32 v17, vcc, 0, v3 833; GCN-NEXT: v_mul_lo_u32 v11, v11, v10 834; GCN-NEXT: v_mul_lo_u32 v13, v13, v12 835; GCN-NEXT: v_mul_lo_u32 v15, v15, v14 836; GCN-NEXT: v_mul_lo_u32 v17, v17, v16 837; GCN-NEXT: v_mul_hi_u32 v11, v10, v11 838; GCN-NEXT: v_mul_hi_u32 v13, v12, v13 839; GCN-NEXT: v_mul_hi_u32 v15, v14, v15 840; GCN-NEXT: v_mul_hi_u32 v17, v16, v17 841; GCN-NEXT: v_add_u32_e32 v10, vcc, v11, v10 842; GCN-NEXT: v_add_u32_e32 v11, vcc, v13, v12 843; GCN-NEXT: v_add_u32_e32 v12, vcc, v15, v14 844; GCN-NEXT: v_add_u32_e32 v13, vcc, v17, v16 845; GCN-NEXT: s_waitcnt vmcnt(0) 846; GCN-NEXT: v_mul_hi_u32 v10, v4, v10 847; GCN-NEXT: v_mul_hi_u32 v11, v5, v11 848; GCN-NEXT: v_mul_hi_u32 v12, v6, v12 849; GCN-NEXT: v_mul_hi_u32 v13, v7, v13 850; GCN-NEXT: v_mul_lo_u32 v14, v10, v0 851; GCN-NEXT: v_mul_lo_u32 v16, v11, v1 852; GCN-NEXT: v_mul_lo_u32 v18, v12, v2 853; GCN-NEXT: v_mul_lo_u32 v19, v13, v3 854; GCN-NEXT: v_subrev_u32_e32 v4, vcc, v14, v4 855; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v16, v5 856; 
GCN-NEXT: v_subrev_u32_e32 v6, vcc, v18, v6 857; GCN-NEXT: v_subrev_u32_e32 v7, vcc, v19, v7 858; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 859; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 860; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v12 861; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v13 862; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0 863; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 864; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 865; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 866; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1] 867; GCN-NEXT: v_subrev_u32_e32 v15, vcc, v0, v4 868; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3] 869; GCN-NEXT: v_subrev_u32_e32 v17, vcc, v1, v5 870; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] 871; GCN-NEXT: v_subrev_u32_e32 v14, vcc, v2, v6 872; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v16, s[6:7] 873; GCN-NEXT: v_subrev_u32_e32 v16, vcc, v3, v7 874; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[0:1] 875; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 876; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v17, s[2:3] 877; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 878; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] 879; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v12 880; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v16, s[6:7] 881; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v13 882; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 883; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v15, vcc 884; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 885; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v17, vcc 886; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 887; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v14, vcc 888; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 889; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc 890; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 891; GCN-NEXT: s_endpgm 892; 893; GFX1030-LABEL: udiv_v4i32: 894; GFX1030: ; %bb.0: 895; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 896; GFX1030-NEXT: v_mov_b32_e32 v8, 0 897; GFX1030-NEXT: s_mov_b32 s0, 0x4f7ffffe 898; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 899; GFX1030-NEXT: s_clause 0x1 
900; GFX1030-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:16 901; GFX1030-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] 902; GFX1030-NEXT: s_waitcnt vmcnt(1) 903; GFX1030-NEXT: v_cvt_f32_u32_e32 v9, v0 904; GFX1030-NEXT: v_cvt_f32_u32_e32 v10, v1 905; GFX1030-NEXT: v_cvt_f32_u32_e32 v11, v2 906; GFX1030-NEXT: v_cvt_f32_u32_e32 v12, v3 907; GFX1030-NEXT: v_sub_nc_u32_e32 v13, 0, v0 908; GFX1030-NEXT: v_rcp_iflag_f32_e32 v9, v9 909; GFX1030-NEXT: v_rcp_iflag_f32_e32 v10, v10 910; GFX1030-NEXT: v_rcp_iflag_f32_e32 v11, v11 911; GFX1030-NEXT: v_rcp_iflag_f32_e32 v12, v12 912; GFX1030-NEXT: v_sub_nc_u32_e32 v14, 0, v1 913; GFX1030-NEXT: v_sub_nc_u32_e32 v15, 0, v2 914; GFX1030-NEXT: v_sub_nc_u32_e32 v16, 0, v3 915; GFX1030-NEXT: v_mul_f32_e32 v9, s0, v9 916; GFX1030-NEXT: v_mul_f32_e32 v10, s0, v10 917; GFX1030-NEXT: v_mul_f32_e32 v11, s0, v11 918; GFX1030-NEXT: v_mul_f32_e32 v12, s0, v12 919; GFX1030-NEXT: v_cvt_u32_f32_e32 v9, v9 920; GFX1030-NEXT: v_cvt_u32_f32_e32 v10, v10 921; GFX1030-NEXT: v_cvt_u32_f32_e32 v11, v11 922; GFX1030-NEXT: v_cvt_u32_f32_e32 v12, v12 923; GFX1030-NEXT: v_mul_lo_u32 v13, v13, v9 924; GFX1030-NEXT: v_mul_lo_u32 v14, v14, v10 925; GFX1030-NEXT: v_mul_lo_u32 v15, v15, v11 926; GFX1030-NEXT: v_mul_lo_u32 v16, v16, v12 927; GFX1030-NEXT: v_mul_hi_u32 v13, v9, v13 928; GFX1030-NEXT: v_mul_hi_u32 v14, v10, v14 929; GFX1030-NEXT: v_mul_hi_u32 v15, v11, v15 930; GFX1030-NEXT: v_mul_hi_u32 v16, v12, v16 931; GFX1030-NEXT: v_add_nc_u32_e32 v9, v9, v13 932; GFX1030-NEXT: v_add_nc_u32_e32 v10, v10, v14 933; GFX1030-NEXT: v_add_nc_u32_e32 v11, v11, v15 934; GFX1030-NEXT: v_add_nc_u32_e32 v12, v12, v16 935; GFX1030-NEXT: s_waitcnt vmcnt(0) 936; GFX1030-NEXT: v_mul_hi_u32 v9, v4, v9 937; GFX1030-NEXT: v_mul_hi_u32 v10, v5, v10 938; GFX1030-NEXT: v_mul_hi_u32 v11, v6, v11 939; GFX1030-NEXT: v_mul_hi_u32 v12, v7, v12 940; GFX1030-NEXT: v_mul_lo_u32 v13, v9, v0 941; GFX1030-NEXT: v_mul_lo_u32 v14, v10, v1 942; GFX1030-NEXT: v_mul_lo_u32 v15, v11, v2 
943; GFX1030-NEXT: v_mul_lo_u32 v16, v12, v3 944; GFX1030-NEXT: v_add_nc_u32_e32 v17, 1, v9 945; GFX1030-NEXT: v_add_nc_u32_e32 v18, 1, v10 946; GFX1030-NEXT: v_add_nc_u32_e32 v19, 1, v11 947; GFX1030-NEXT: v_sub_nc_u32_e32 v4, v4, v13 948; GFX1030-NEXT: v_sub_nc_u32_e32 v5, v5, v14 949; GFX1030-NEXT: v_sub_nc_u32_e32 v6, v6, v15 950; GFX1030-NEXT: v_sub_nc_u32_e32 v7, v7, v16 951; GFX1030-NEXT: v_add_nc_u32_e32 v13, 1, v12 952; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v0 953; GFX1030-NEXT: v_sub_nc_u32_e32 v14, v4, v0 954; GFX1030-NEXT: v_cmp_ge_u32_e64 s0, v5, v1 955; GFX1030-NEXT: v_sub_nc_u32_e32 v15, v5, v1 956; GFX1030-NEXT: v_cmp_ge_u32_e64 s1, v6, v2 957; GFX1030-NEXT: v_cndmask_b32_e32 v9, v9, v17, vcc_lo 958; GFX1030-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo 959; GFX1030-NEXT: v_cndmask_b32_e64 v10, v10, v18, s0 960; GFX1030-NEXT: v_sub_nc_u32_e32 v16, v6, v2 961; GFX1030-NEXT: v_cmp_ge_u32_e64 s2, v7, v3 962; GFX1030-NEXT: v_add_nc_u32_e32 v14, 1, v9 963; GFX1030-NEXT: v_cndmask_b32_e64 v5, v5, v15, s0 964; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v0 965; GFX1030-NEXT: v_cndmask_b32_e64 v11, v11, v19, s1 966; GFX1030-NEXT: v_cndmask_b32_e64 v12, v12, v13, s2 967; GFX1030-NEXT: v_sub_nc_u32_e32 v13, v7, v3 968; GFX1030-NEXT: v_add_nc_u32_e32 v15, 1, v10 969; GFX1030-NEXT: v_cndmask_b32_e64 v6, v6, v16, s1 970; GFX1030-NEXT: v_cndmask_b32_e32 v0, v9, v14, vcc_lo 971; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v1 972; GFX1030-NEXT: v_add_nc_u32_e32 v16, 1, v11 973; GFX1030-NEXT: v_cndmask_b32_e64 v7, v7, v13, s2 974; GFX1030-NEXT: v_add_nc_u32_e32 v13, 1, v12 975; GFX1030-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc_lo 976; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v6, v2 977; GFX1030-NEXT: v_cndmask_b32_e32 v2, v11, v16, vcc_lo 978; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v7, v3 979; GFX1030-NEXT: v_cndmask_b32_e32 v3, v12, v13, vcc_lo 980; GFX1030-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] 981; GFX1030-NEXT: s_endpgm 982; 983; EG-LABEL: udiv_v4i32: 984; 
EG: ; %bb.0: 985; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 986; EG-NEXT: TEX 1 @6 987; EG-NEXT: ALU 65, @11, KC0[CB0:0-32], KC1[] 988; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1 989; EG-NEXT: CF_END 990; EG-NEXT: PAD 991; EG-NEXT: Fetch clause starting at 6: 992; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 993; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 994; EG-NEXT: ALU clause starting at 10: 995; EG-NEXT: MOV * T0.X, KC0[2].Z, 996; EG-NEXT: ALU clause starting at 11: 997; EG-NEXT: SUB_INT T2.W, 0.0, T1.W, 998; EG-NEXT: RECIP_UINT * T2.X, T1.W, 999; EG-NEXT: MULLO_INT * T2.Y, PV.W, PS, 1000; EG-NEXT: MULHI * T2.Y, T2.X, PS, 1001; EG-NEXT: ADD_INT * T2.W, T2.X, PS, 1002; EG-NEXT: MULHI * T2.X, T0.W, PV.W, 1003; EG-NEXT: MULLO_INT * T2.Y, PS, T1.W, 1004; EG-NEXT: SUB_INT T2.W, 0.0, T1.X, 1005; EG-NEXT: RECIP_UINT * T2.Z, T1.X, 1006; EG-NEXT: MULLO_INT * T2.W, PV.W, PS, 1007; EG-NEXT: SUB_INT T3.W, 0.0, T1.Y, 1008; EG-NEXT: RECIP_UINT * T3.X, T1.Y, 1009; EG-NEXT: MULLO_INT * T3.Y, PV.W, PS, 1010; EG-NEXT: SUB_INT T3.W, 0.0, T1.Z, 1011; EG-NEXT: RECIP_UINT * T3.Z, T1.Z, 1012; EG-NEXT: MULLO_INT * T3.W, PV.W, PS, 1013; EG-NEXT: MULHI * T3.W, T3.Z, PS, 1014; EG-NEXT: ADD_INT T3.W, T3.Z, PS, 1015; EG-NEXT: MULHI * T3.Y, T3.X, T3.Y, 1016; EG-NEXT: ADD_INT T4.W, T3.X, PS, 1017; EG-NEXT: MULHI * T3.X, T0.Z, PV.W, 1018; EG-NEXT: MULHI * T3.Y, T0.Y, PV.W, 1019; EG-NEXT: MULLO_INT * T3.Z, PS, T1.Y, 1020; EG-NEXT: SUB_INT T3.W, T0.Y, PS, 1021; EG-NEXT: MULLO_INT * T0.Y, T3.X, T1.Z, 1022; EG-NEXT: SUB_INT T4.X, T0.Z, PS, 1023; EG-NEXT: ADD_INT T0.Y, T3.Y, 1, 1024; EG-NEXT: SETGE_UINT T0.Z, PV.W, T1.Y, 1025; EG-NEXT: SUB_INT T4.W, PV.W, T1.Y, 1026; EG-NEXT: MULHI * T2.W, T2.Z, T2.W, 1027; EG-NEXT: CNDE_INT T5.X, PV.Z, T3.W, PV.W, 1028; EG-NEXT: CNDE_INT T0.Y, PV.Z, T3.Y, PV.Y, BS:VEC_021/SCL_122 1029; EG-NEXT: SETGE_UINT T0.Z, PV.X, T1.Z, 1030; EG-NEXT: ADD_INT T2.W, T2.Z, PS, 1031; EG-NEXT: SUB_INT * T0.W, T0.W, T2.Y, 1032; EG-NEXT: ADD_INT T6.X, T3.X, 1, 1033; 
EG-NEXT: ADD_INT T2.Y, T2.X, 1, BS:VEC_120/SCL_212 1034; EG-NEXT: SETGE_UINT T2.Z, PS, T1.W, 1035; EG-NEXT: SUB_INT T3.W, PS, T1.W, 1036; EG-NEXT: MULHI * T2.W, T0.X, PV.W, 1037; EG-NEXT: SUB_INT T7.X, T4.X, T1.Z, 1038; EG-NEXT: CNDE_INT T3.Y, PV.Z, T0.W, PV.W, 1039; EG-NEXT: CNDE_INT T2.Z, PV.Z, T2.X, PV.Y, 1040; EG-NEXT: CNDE_INT * T0.W, T0.Z, T3.X, PV.X, BS:VEC_021/SCL_122 1041; EG-NEXT: MULLO_INT * T2.X, T2.W, T1.X, 1042; EG-NEXT: ADD_INT T3.X, T0.W, 1, 1043; EG-NEXT: ADD_INT T2.Y, T2.Z, 1, 1044; EG-NEXT: SETGE_UINT T3.Z, T3.Y, T1.W, 1045; EG-NEXT: SUB_INT T1.W, T0.X, PS, BS:VEC_201 1046; EG-NEXT: CNDE_INT * T3.W, T0.Z, T4.X, T7.X, 1047; EG-NEXT: SETGE_UINT T0.X, PS, T1.Z, BS:VEC_021/SCL_122 1048; EG-NEXT: ADD_INT T3.Y, T2.W, 1, 1049; EG-NEXT: SETGE_UINT T0.Z, PV.W, T1.X, 1050; EG-NEXT: SUB_INT T3.W, PV.W, T1.X, 1051; EG-NEXT: CNDE_INT * T4.W, PV.Z, T2.Z, PV.Y, 1052; EG-NEXT: CNDE_INT T2.X, PV.Z, T1.W, PV.W, 1053; EG-NEXT: CNDE_INT T2.Y, PV.Z, T2.W, PV.Y, BS:VEC_021/SCL_122 1054; EG-NEXT: CNDE_INT T4.Z, PV.X, T0.W, T3.X, BS:VEC_201 1055; EG-NEXT: ADD_INT T0.W, T0.Y, 1, 1056; EG-NEXT: SETGE_UINT * T1.W, T5.X, T1.Y, 1057; EG-NEXT: CNDE_INT T4.Y, PS, T0.Y, PV.W, 1058; EG-NEXT: ADD_INT T0.W, PV.Y, 1, 1059; EG-NEXT: SETGE_UINT * T1.W, PV.X, T1.X, 1060; EG-NEXT: CNDE_INT T4.X, PS, T2.Y, PV.W, 1061; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1062; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1063 %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 1064 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in 1065 %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr 1066 %result = udiv <4 x i32> %a, %b 1067 store <4 x i32> %result, <4 x i32> addrspace(1)* %out 1068 ret void 1069} 1070 1071define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { 1072; SI-LABEL: udiv_i32_div_pow2: 1073; SI: ; %bb.0: 1074; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1075; SI-NEXT: s_mov_b32 s7, 0xf000 1076; SI-NEXT: s_mov_b32 s6, -1 1077; 
SI-NEXT: s_mov_b32 s10, s6 1078; SI-NEXT: s_mov_b32 s11, s7 1079; SI-NEXT: s_waitcnt lgkmcnt(0) 1080; SI-NEXT: s_mov_b32 s8, s2 1081; SI-NEXT: s_mov_b32 s9, s3 1082; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1083; SI-NEXT: s_mov_b32 s4, s0 1084; SI-NEXT: s_mov_b32 s5, s1 1085; SI-NEXT: s_waitcnt vmcnt(0) 1086; SI-NEXT: v_lshrrev_b32_e32 v0, 4, v0 1087; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1088; SI-NEXT: s_endpgm 1089; 1090; VI-LABEL: udiv_i32_div_pow2: 1091; VI: ; %bb.0: 1092; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1093; VI-NEXT: s_mov_b32 s7, 0xf000 1094; VI-NEXT: s_mov_b32 s6, -1 1095; VI-NEXT: s_mov_b32 s10, s6 1096; VI-NEXT: s_mov_b32 s11, s7 1097; VI-NEXT: s_waitcnt lgkmcnt(0) 1098; VI-NEXT: s_mov_b32 s8, s2 1099; VI-NEXT: s_mov_b32 s9, s3 1100; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1101; VI-NEXT: s_mov_b32 s4, s0 1102; VI-NEXT: s_mov_b32 s5, s1 1103; VI-NEXT: s_waitcnt vmcnt(0) 1104; VI-NEXT: v_lshrrev_b32_e32 v0, 4, v0 1105; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1106; VI-NEXT: s_endpgm 1107; 1108; GCN-LABEL: udiv_i32_div_pow2: 1109; GCN: ; %bb.0: 1110; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1111; GCN-NEXT: s_waitcnt lgkmcnt(0) 1112; GCN-NEXT: v_mov_b32_e32 v0, s2 1113; GCN-NEXT: v_mov_b32_e32 v1, s3 1114; GCN-NEXT: flat_load_dword v2, v[0:1] 1115; GCN-NEXT: v_mov_b32_e32 v0, s0 1116; GCN-NEXT: v_mov_b32_e32 v1, s1 1117; GCN-NEXT: s_waitcnt vmcnt(0) 1118; GCN-NEXT: v_lshrrev_b32_e32 v2, 4, v2 1119; GCN-NEXT: flat_store_dword v[0:1], v2 1120; GCN-NEXT: s_endpgm 1121; 1122; GFX1030-LABEL: udiv_i32_div_pow2: 1123; GFX1030: ; %bb.0: 1124; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1125; GFX1030-NEXT: v_mov_b32_e32 v0, 0 1126; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 1127; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] 1128; GFX1030-NEXT: s_waitcnt vmcnt(0) 1129; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 4, v1 1130; GFX1030-NEXT: global_store_dword v0, v1, s[0:1] 1131; GFX1030-NEXT: s_endpgm 1132; 1133; EG-LABEL: 
udiv_i32_div_pow2: 1134; EG: ; %bb.0: 1135; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1136; EG-NEXT: TEX 0 @6 1137; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 1138; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1139; EG-NEXT: CF_END 1140; EG-NEXT: PAD 1141; EG-NEXT: Fetch clause starting at 6: 1142; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1143; EG-NEXT: ALU clause starting at 8: 1144; EG-NEXT: MOV * T0.X, KC0[2].Z, 1145; EG-NEXT: ALU clause starting at 9: 1146; EG-NEXT: LSHR T0.X, T0.X, literal.x, 1147; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1148; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45) 1149 %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 1150 %a = load i32, i32 addrspace(1)* %in 1151 %result = udiv i32 %a, 16 1152 store i32 %result, i32 addrspace(1)* %out 1153 ret void 1154} 1155 1156define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { 1157; SI-LABEL: udiv_i32_div_k_even: 1158; SI: ; %bb.0: 1159; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1160; SI-NEXT: s_mov_b32 s7, 0xf000 1161; SI-NEXT: s_mov_b32 s6, -1 1162; SI-NEXT: s_mov_b32 s10, s6 1163; SI-NEXT: s_mov_b32 s11, s7 1164; SI-NEXT: s_waitcnt lgkmcnt(0) 1165; SI-NEXT: s_mov_b32 s8, s2 1166; SI-NEXT: s_mov_b32 s9, s3 1167; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1168; SI-NEXT: s_mov_b32 s2, 0xfabbd9c1 1169; SI-NEXT: s_mov_b32 s4, s0 1170; SI-NEXT: s_mov_b32 s5, s1 1171; SI-NEXT: s_waitcnt vmcnt(0) 1172; SI-NEXT: v_mul_hi_u32 v0, v0, s2 1173; SI-NEXT: v_lshrrev_b32_e32 v0, 25, v0 1174; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1175; SI-NEXT: s_endpgm 1176; 1177; VI-LABEL: udiv_i32_div_k_even: 1178; VI: ; %bb.0: 1179; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1180; VI-NEXT: s_mov_b32 s7, 0xf000 1181; VI-NEXT: s_mov_b32 s6, -1 1182; VI-NEXT: s_mov_b32 s10, s6 1183; VI-NEXT: s_mov_b32 s11, s7 1184; VI-NEXT: s_waitcnt lgkmcnt(0) 1185; VI-NEXT: s_mov_b32 s8, s2 1186; VI-NEXT: s_mov_b32 s9, s3 1187; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 
1188; VI-NEXT: s_mov_b32 s2, 0xfabbd9c1 1189; VI-NEXT: s_mov_b32 s4, s0 1190; VI-NEXT: s_mov_b32 s5, s1 1191; VI-NEXT: s_waitcnt vmcnt(0) 1192; VI-NEXT: v_mul_hi_u32 v0, v0, s2 1193; VI-NEXT: v_lshrrev_b32_e32 v0, 25, v0 1194; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1195; VI-NEXT: s_endpgm 1196; 1197; GCN-LABEL: udiv_i32_div_k_even: 1198; GCN: ; %bb.0: 1199; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1200; GCN-NEXT: s_waitcnt lgkmcnt(0) 1201; GCN-NEXT: v_mov_b32_e32 v0, s2 1202; GCN-NEXT: v_mov_b32_e32 v1, s3 1203; GCN-NEXT: flat_load_dword v0, v[0:1] 1204; GCN-NEXT: s_mov_b32 s2, 0xfabbd9c1 1205; GCN-NEXT: v_mov_b32_e32 v1, s1 1206; GCN-NEXT: s_waitcnt vmcnt(0) 1207; GCN-NEXT: v_mul_hi_u32 v2, v0, s2 1208; GCN-NEXT: v_mov_b32_e32 v0, s0 1209; GCN-NEXT: v_lshrrev_b32_e32 v2, 25, v2 1210; GCN-NEXT: flat_store_dword v[0:1], v2 1211; GCN-NEXT: s_endpgm 1212; 1213; GFX1030-LABEL: udiv_i32_div_k_even: 1214; GFX1030: ; %bb.0: 1215; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1216; GFX1030-NEXT: v_mov_b32_e32 v0, 0 1217; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 1218; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] 1219; GFX1030-NEXT: s_waitcnt vmcnt(0) 1220; GFX1030-NEXT: v_mul_hi_u32 v1, 0xfabbd9c1, v1 1221; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 25, v1 1222; GFX1030-NEXT: global_store_dword v0, v1, s[0:1] 1223; GFX1030-NEXT: s_endpgm 1224; 1225; EG-LABEL: udiv_i32_div_k_even: 1226; EG: ; %bb.0: 1227; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1228; EG-NEXT: TEX 0 @6 1229; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] 1230; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1231; EG-NEXT: CF_END 1232; EG-NEXT: PAD 1233; EG-NEXT: Fetch clause starting at 6: 1234; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1235; EG-NEXT: ALU clause starting at 8: 1236; EG-NEXT: MOV * T0.X, KC0[2].Z, 1237; EG-NEXT: ALU clause starting at 9: 1238; EG-NEXT: MULHI * T0.X, T0.X, literal.x, 1239; EG-NEXT: -88352319(-4.876880e+35), 0(0.000000e+00) 1240; EG-NEXT: LSHR T0.X, PS, literal.x, 1241; EG-NEXT: 
LSHR * T1.X, KC0[2].Y, literal.y, 1242; EG-NEXT: 25(3.503246e-44), 2(2.802597e-45) 1243 %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 1244 %a = load i32, i32 addrspace(1)* %in 1245 %result = udiv i32 %a, 34259182 1246 store i32 %result, i32 addrspace(1)* %out 1247 ret void 1248} 1249 1250define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { 1251; SI-LABEL: udiv_i32_div_k_odd: 1252; SI: ; %bb.0: 1253; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1254; SI-NEXT: s_mov_b32 s7, 0xf000 1255; SI-NEXT: s_mov_b32 s6, -1 1256; SI-NEXT: s_mov_b32 s10, s6 1257; SI-NEXT: s_mov_b32 s11, s7 1258; SI-NEXT: s_waitcnt lgkmcnt(0) 1259; SI-NEXT: s_mov_b32 s8, s2 1260; SI-NEXT: s_mov_b32 s9, s3 1261; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1262; SI-NEXT: s_mov_b32 s2, 0x7d5deca3 1263; SI-NEXT: s_mov_b32 s4, s0 1264; SI-NEXT: s_mov_b32 s5, s1 1265; SI-NEXT: s_waitcnt vmcnt(0) 1266; SI-NEXT: v_mul_hi_u32 v0, v0, s2 1267; SI-NEXT: v_lshrrev_b32_e32 v0, 24, v0 1268; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1269; SI-NEXT: s_endpgm 1270; 1271; VI-LABEL: udiv_i32_div_k_odd: 1272; VI: ; %bb.0: 1273; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1274; VI-NEXT: s_mov_b32 s7, 0xf000 1275; VI-NEXT: s_mov_b32 s6, -1 1276; VI-NEXT: s_mov_b32 s10, s6 1277; VI-NEXT: s_mov_b32 s11, s7 1278; VI-NEXT: s_waitcnt lgkmcnt(0) 1279; VI-NEXT: s_mov_b32 s8, s2 1280; VI-NEXT: s_mov_b32 s9, s3 1281; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1282; VI-NEXT: s_mov_b32 s2, 0x7d5deca3 1283; VI-NEXT: s_mov_b32 s4, s0 1284; VI-NEXT: s_mov_b32 s5, s1 1285; VI-NEXT: s_waitcnt vmcnt(0) 1286; VI-NEXT: v_mul_hi_u32 v0, v0, s2 1287; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0 1288; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1289; VI-NEXT: s_endpgm 1290; 1291; GCN-LABEL: udiv_i32_div_k_odd: 1292; GCN: ; %bb.0: 1293; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1294; GCN-NEXT: s_waitcnt lgkmcnt(0) 1295; GCN-NEXT: v_mov_b32_e32 v0, s2 1296; GCN-NEXT: v_mov_b32_e32 
v1, s3 1297; GCN-NEXT: flat_load_dword v0, v[0:1] 1298; GCN-NEXT: s_mov_b32 s2, 0x7d5deca3 1299; GCN-NEXT: v_mov_b32_e32 v1, s1 1300; GCN-NEXT: s_waitcnt vmcnt(0) 1301; GCN-NEXT: v_mul_hi_u32 v2, v0, s2 1302; GCN-NEXT: v_mov_b32_e32 v0, s0 1303; GCN-NEXT: v_lshrrev_b32_e32 v2, 24, v2 1304; GCN-NEXT: flat_store_dword v[0:1], v2 1305; GCN-NEXT: s_endpgm 1306; 1307; GFX1030-LABEL: udiv_i32_div_k_odd: 1308; GFX1030: ; %bb.0: 1309; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1310; GFX1030-NEXT: v_mov_b32_e32 v0, 0 1311; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 1312; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] 1313; GFX1030-NEXT: s_waitcnt vmcnt(0) 1314; GFX1030-NEXT: v_mul_hi_u32 v1, 0x7d5deca3, v1 1315; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 24, v1 1316; GFX1030-NEXT: global_store_dword v0, v1, s[0:1] 1317; GFX1030-NEXT: s_endpgm 1318; 1319; EG-LABEL: udiv_i32_div_k_odd: 1320; EG: ; %bb.0: 1321; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1322; EG-NEXT: TEX 0 @6 1323; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] 1324; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1325; EG-NEXT: CF_END 1326; EG-NEXT: PAD 1327; EG-NEXT: Fetch clause starting at 6: 1328; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1329; EG-NEXT: ALU clause starting at 8: 1330; EG-NEXT: MOV * T0.X, KC0[2].Z, 1331; EG-NEXT: ALU clause starting at 9: 1332; EG-NEXT: MULHI * T0.X, T0.X, literal.x, 1333; EG-NEXT: 2103307427(1.843675e+37), 0(0.000000e+00) 1334; EG-NEXT: LSHR T0.X, PS, literal.x, 1335; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1336; EG-NEXT: 24(3.363116e-44), 2(2.802597e-45) 1337 %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 1338 %a = load i32, i32 addrspace(1)* %in 1339 %result = udiv i32 %a, 34259183 1340 store i32 %result, i32 addrspace(1)* %out 1341 ret void 1342} 1343 1344define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { 1345; SI-LABEL: v_udiv_i8: 1346; SI: ; %bb.0: 1347; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1348; SI-NEXT: s_mov_b32 s7, 0xf000 
1349; SI-NEXT: s_mov_b32 s6, -1 1350; SI-NEXT: s_mov_b32 s10, s6 1351; SI-NEXT: s_mov_b32 s11, s7 1352; SI-NEXT: s_waitcnt lgkmcnt(0) 1353; SI-NEXT: s_mov_b32 s8, s2 1354; SI-NEXT: s_mov_b32 s9, s3 1355; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 1356; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 1357; SI-NEXT: s_mov_b32 s4, s0 1358; SI-NEXT: s_mov_b32 s5, s1 1359; SI-NEXT: s_waitcnt vmcnt(1) 1360; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1361; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 1362; SI-NEXT: s_waitcnt vmcnt(0) 1363; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 1364; SI-NEXT: v_mul_f32_e32 v2, v1, v2 1365; SI-NEXT: v_trunc_f32_e32 v2, v2 1366; SI-NEXT: v_cvt_u32_f32_e32 v3, v2 1367; SI-NEXT: v_mad_f32 v1, -v2, v0, v1 1368; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1369; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 1370; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 1371; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1372; SI-NEXT: s_endpgm 1373; 1374; VI-LABEL: v_udiv_i8: 1375; VI: ; %bb.0: 1376; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1377; VI-NEXT: s_mov_b32 s7, 0xf000 1378; VI-NEXT: s_mov_b32 s6, -1 1379; VI-NEXT: s_mov_b32 s10, s6 1380; VI-NEXT: s_mov_b32 s11, s7 1381; VI-NEXT: s_waitcnt lgkmcnt(0) 1382; VI-NEXT: s_mov_b32 s8, s2 1383; VI-NEXT: s_mov_b32 s9, s3 1384; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 1385; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 1386; VI-NEXT: s_mov_b32 s4, s0 1387; VI-NEXT: s_mov_b32 s5, s1 1388; VI-NEXT: s_waitcnt vmcnt(1) 1389; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1390; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 1391; VI-NEXT: s_waitcnt vmcnt(0) 1392; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 1393; VI-NEXT: v_mul_f32_e32 v2, v1, v2 1394; VI-NEXT: v_trunc_f32_e32 v2, v2 1395; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 1396; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 1397; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1398; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 1399; VI-NEXT: v_and_b32_e32 v0, 0xff, v0 1400; VI-NEXT: buffer_store_dword v0, off, 
s[4:7], 0 1401; VI-NEXT: s_endpgm 1402; 1403; GCN-LABEL: v_udiv_i8: 1404; GCN: ; %bb.0: 1405; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1406; GCN-NEXT: s_waitcnt lgkmcnt(0) 1407; GCN-NEXT: v_mov_b32_e32 v0, s2 1408; GCN-NEXT: v_mov_b32_e32 v1, s3 1409; GCN-NEXT: flat_load_ushort v2, v[0:1] 1410; GCN-NEXT: v_mov_b32_e32 v0, s0 1411; GCN-NEXT: v_mov_b32_e32 v1, s1 1412; GCN-NEXT: s_waitcnt vmcnt(0) 1413; GCN-NEXT: v_cvt_f32_ubyte1_e32 v3, v2 1414; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v3 1415; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 1416; GCN-NEXT: v_mul_f32_e32 v4, v2, v4 1417; GCN-NEXT: v_trunc_f32_e32 v4, v4 1418; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4 1419; GCN-NEXT: v_mad_f32 v2, -v4, v3, v2 1420; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 1421; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 1422; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 1423; GCN-NEXT: flat_store_dword v[0:1], v2 1424; GCN-NEXT: s_endpgm 1425; 1426; GFX1030-LABEL: v_udiv_i8: 1427; GFX1030: ; %bb.0: 1428; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1429; GFX1030-NEXT: v_mov_b32_e32 v0, 0 1430; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 1431; GFX1030-NEXT: global_load_ushort v1, v0, s[2:3] 1432; GFX1030-NEXT: s_waitcnt vmcnt(0) 1433; GFX1030-NEXT: v_cvt_f32_ubyte1_e32 v2, v1 1434; GFX1030-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 1435; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v2 1436; GFX1030-NEXT: v_mul_f32_e32 v3, v1, v3 1437; GFX1030-NEXT: v_trunc_f32_e32 v3, v3 1438; GFX1030-NEXT: v_fma_f32 v1, -v3, v2, v1 1439; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3 1440; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v1|, v2 1441; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo 1442; GFX1030-NEXT: v_and_b32_e32 v1, 0xff, v1 1443; GFX1030-NEXT: global_store_dword v0, v1, s[0:1] 1444; GFX1030-NEXT: s_endpgm 1445; 1446; EG-LABEL: v_udiv_i8: 1447; EG: ; %bb.0: 1448; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 1449; EG-NEXT: TEX 1 @6 1450; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[] 1451; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 
1452; EG-NEXT: CF_END 1453; EG-NEXT: PAD 1454; EG-NEXT: Fetch clause starting at 6: 1455; EG-NEXT: VTX_READ_8 T1.X, T0.X, 1, #1 1456; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1457; EG-NEXT: ALU clause starting at 10: 1458; EG-NEXT: MOV * T0.X, KC0[2].Z, 1459; EG-NEXT: ALU clause starting at 11: 1460; EG-NEXT: UINT_TO_FLT * T0.Y, T1.X, 1461; EG-NEXT: RECIP_IEEE * T0.Z, PS, 1462; EG-NEXT: UINT_TO_FLT * T0.X, T0.X, 1463; EG-NEXT: MUL_IEEE * T0.W, PS, T0.Z, 1464; EG-NEXT: TRUNC * T0.W, PV.W, 1465; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X, 1466; EG-NEXT: TRUNC * T0.W, PV.W, 1467; EG-NEXT: SETGE * T1.W, |PV.W|, |T0.Y|, 1468; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x, 1469; EG-NEXT: FLT_TO_UINT * T0.X, T0.W, 1470; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 1471; EG-NEXT: ADD_INT * T0.W, PS, PV.W, 1472; EG-NEXT: AND_INT T0.X, PV.W, literal.x, 1473; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1474; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45) 1475 %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 1476 %num = load i8, i8 addrspace(1) * %in 1477 %den = load i8, i8 addrspace(1) * %den_ptr 1478 %result = udiv i8 %num, %den 1479 %result.ext = zext i8 %result to i32 1480 store i32 %result.ext, i32 addrspace(1)* %out 1481 ret void 1482} 1483 1484define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { 1485; SI-LABEL: v_udiv_i16: 1486; SI: ; %bb.0: 1487; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1488; SI-NEXT: s_mov_b32 s7, 0xf000 1489; SI-NEXT: s_mov_b32 s6, -1 1490; SI-NEXT: s_mov_b32 s10, s6 1491; SI-NEXT: s_mov_b32 s11, s7 1492; SI-NEXT: s_waitcnt lgkmcnt(0) 1493; SI-NEXT: s_mov_b32 s8, s2 1494; SI-NEXT: s_mov_b32 s9, s3 1495; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 1496; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 1497; SI-NEXT: s_mov_b32 s4, s0 1498; SI-NEXT: s_mov_b32 s5, s1 1499; SI-NEXT: s_waitcnt vmcnt(1) 1500; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 1501; SI-NEXT: s_waitcnt vmcnt(0) 1502; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 
1503; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 1504; SI-NEXT: v_mul_f32_e32 v2, v1, v2 1505; SI-NEXT: v_trunc_f32_e32 v2, v2 1506; SI-NEXT: v_cvt_u32_f32_e32 v3, v2 1507; SI-NEXT: v_mad_f32 v1, -v2, v0, v1 1508; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1509; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 1510; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 1511; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1512; SI-NEXT: s_endpgm 1513; 1514; VI-LABEL: v_udiv_i16: 1515; VI: ; %bb.0: 1516; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1517; VI-NEXT: s_mov_b32 s7, 0xf000 1518; VI-NEXT: s_mov_b32 s6, -1 1519; VI-NEXT: s_mov_b32 s10, s6 1520; VI-NEXT: s_mov_b32 s11, s7 1521; VI-NEXT: s_waitcnt lgkmcnt(0) 1522; VI-NEXT: s_mov_b32 s8, s2 1523; VI-NEXT: s_mov_b32 s9, s3 1524; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 1525; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 1526; VI-NEXT: s_mov_b32 s4, s0 1527; VI-NEXT: s_mov_b32 s5, s1 1528; VI-NEXT: s_waitcnt vmcnt(1) 1529; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 1530; VI-NEXT: s_waitcnt vmcnt(0) 1531; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 1532; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 1533; VI-NEXT: v_mul_f32_e32 v2, v1, v2 1534; VI-NEXT: v_trunc_f32_e32 v2, v2 1535; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 1536; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 1537; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1538; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 1539; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 1540; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1541; VI-NEXT: s_endpgm 1542; 1543; GCN-LABEL: v_udiv_i16: 1544; GCN: ; %bb.0: 1545; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1546; GCN-NEXT: s_waitcnt lgkmcnt(0) 1547; GCN-NEXT: s_add_u32 s4, s2, 2 1548; GCN-NEXT: s_addc_u32 s5, s3, 0 1549; GCN-NEXT: v_mov_b32_e32 v0, s4 1550; GCN-NEXT: v_mov_b32_e32 v1, s5 1551; GCN-NEXT: flat_load_ushort v2, v[0:1] 1552; GCN-NEXT: v_mov_b32_e32 v0, s2 1553; GCN-NEXT: v_mov_b32_e32 v1, s3 1554; GCN-NEXT: flat_load_ushort v0, v[0:1] 1555; GCN-NEXT: v_mov_b32_e32 v1, s1 1556; 
GCN-NEXT: s_waitcnt vmcnt(1) 1557; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 1558; GCN-NEXT: s_waitcnt vmcnt(0) 1559; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0 1560; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 1561; GCN-NEXT: v_mov_b32_e32 v0, s0 1562; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 1563; GCN-NEXT: v_trunc_f32_e32 v4, v4 1564; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4 1565; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 1566; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 1567; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 1568; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 1569; GCN-NEXT: flat_store_dword v[0:1], v2 1570; GCN-NEXT: s_endpgm 1571; 1572; GFX1030-LABEL: v_udiv_i16: 1573; GFX1030: ; %bb.0: 1574; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1575; GFX1030-NEXT: v_mov_b32_e32 v0, 0 1576; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 1577; GFX1030-NEXT: s_clause 0x1 1578; GFX1030-NEXT: global_load_ushort v1, v0, s[2:3] offset:2 1579; GFX1030-NEXT: global_load_ushort v2, v0, s[2:3] 1580; GFX1030-NEXT: s_waitcnt vmcnt(1) 1581; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, v1 1582; GFX1030-NEXT: s_waitcnt vmcnt(0) 1583; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, v2 1584; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v1 1585; GFX1030-NEXT: v_mul_f32_e32 v3, v2, v3 1586; GFX1030-NEXT: v_trunc_f32_e32 v3, v3 1587; GFX1030-NEXT: v_fma_f32 v2, -v3, v1, v2 1588; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3 1589; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v1 1590; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo 1591; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v1 1592; GFX1030-NEXT: global_store_dword v0, v1, s[0:1] 1593; GFX1030-NEXT: s_endpgm 1594; 1595; EG-LABEL: v_udiv_i16: 1596; EG: ; %bb.0: 1597; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 1598; EG-NEXT: TEX 1 @6 1599; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[] 1600; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1601; EG-NEXT: CF_END 1602; EG-NEXT: PAD 1603; EG-NEXT: Fetch clause starting at 6: 1604; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1 1605; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, 
; Documentation for @v_udiv_i23, whose `define` sits partway through the line
; below. The kernel loads two i23 values from consecutive elements of %in,
; divides them with udiv, zero-extends the quotient to i32 and stores it to
; %out. On every target the assertions expect the float-based lowering
; (convert to f32, reciprocal, multiply, truncate, multiply-add, compare and a
; conditional +1 correction) followed by a mask with 0x7fffff (8388607 in the
; r600 output); no 32-bit integer division expansion is expected.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
#1 1606; EG-NEXT: ALU clause starting at 10: 1607; EG-NEXT: MOV * T0.X, KC0[2].Z, 1608; EG-NEXT: ALU clause starting at 11: 1609; EG-NEXT: UINT_TO_FLT * T0.Y, T1.X, 1610; EG-NEXT: RECIP_IEEE * T0.Z, PS, 1611; EG-NEXT: UINT_TO_FLT * T0.X, T0.X, 1612; EG-NEXT: MUL_IEEE * T0.W, PS, T0.Z, 1613; EG-NEXT: TRUNC * T0.W, PV.W, 1614; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X, 1615; EG-NEXT: TRUNC * T0.W, PV.W, 1616; EG-NEXT: SETGE * T1.W, |PV.W|, |T0.Y|, 1617; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x, 1618; EG-NEXT: FLT_TO_UINT * T0.X, T0.W, 1619; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 1620; EG-NEXT: ADD_INT * T0.W, PS, PV.W, 1621; EG-NEXT: AND_INT T0.X, PV.W, literal.x, 1622; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1623; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) 1624 %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 1625 %num = load i16, i16 addrspace(1) * %in 1626 %den = load i16, i16 addrspace(1) * %den_ptr 1627 %result = udiv i16 %num, %den 1628 %result.ext = zext i16 %result to i32 1629 store i32 %result.ext, i32 addrspace(1)* %out 1630 ret void 1631} 1632 1633define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { 1634; SI-LABEL: v_udiv_i23: 1635; SI: ; %bb.0: 1636; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1637; SI-NEXT: s_mov_b32 s7, 0xf000 1638; SI-NEXT: s_mov_b32 s6, -1 1639; SI-NEXT: s_mov_b32 s10, s6 1640; SI-NEXT: s_mov_b32 s11, s7 1641; SI-NEXT: s_waitcnt lgkmcnt(0) 1642; SI-NEXT: s_mov_b32 s8, s2 1643; SI-NEXT: s_mov_b32 s9, s3 1644; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 1645; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 1646; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 1647; SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 1648; SI-NEXT: s_mov_b32 s4, s0 1649; SI-NEXT: s_mov_b32 s5, s1 1650; SI-NEXT: s_waitcnt vmcnt(3) 1651; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1652; SI-NEXT: s_waitcnt vmcnt(2) 1653; SI-NEXT: v_or_b32_e32 v0, v1, v0 1654; SI-NEXT: 
v_cvt_f32_u32_e32 v0, v0 1655; SI-NEXT: s_waitcnt vmcnt(1) 1656; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 1657; SI-NEXT: s_waitcnt vmcnt(0) 1658; SI-NEXT: v_or_b32_e32 v1, v3, v1 1659; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 1660; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 1661; SI-NEXT: v_mul_f32_e32 v2, v1, v2 1662; SI-NEXT: v_trunc_f32_e32 v2, v2 1663; SI-NEXT: v_cvt_u32_f32_e32 v3, v2 1664; SI-NEXT: v_mad_f32 v1, -v2, v0, v1 1665; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1666; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 1667; SI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 1668; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1669; SI-NEXT: s_endpgm 1670; 1671; VI-LABEL: v_udiv_i23: 1672; VI: ; %bb.0: 1673; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1674; VI-NEXT: s_mov_b32 s7, 0xf000 1675; VI-NEXT: s_mov_b32 s6, -1 1676; VI-NEXT: s_mov_b32 s10, s6 1677; VI-NEXT: s_mov_b32 s11, s7 1678; VI-NEXT: s_waitcnt lgkmcnt(0) 1679; VI-NEXT: s_mov_b32 s8, s2 1680; VI-NEXT: s_mov_b32 s9, s3 1681; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 1682; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 1683; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 1684; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 1685; VI-NEXT: s_mov_b32 s4, s0 1686; VI-NEXT: s_mov_b32 s5, s1 1687; VI-NEXT: s_waitcnt vmcnt(3) 1688; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1689; VI-NEXT: s_waitcnt vmcnt(2) 1690; VI-NEXT: v_or_b32_e32 v0, v1, v0 1691; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 1692; VI-NEXT: s_waitcnt vmcnt(1) 1693; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 1694; VI-NEXT: s_waitcnt vmcnt(0) 1695; VI-NEXT: v_or_b32_e32 v1, v3, v1 1696; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 1697; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 1698; VI-NEXT: v_mul_f32_e32 v2, v1, v2 1699; VI-NEXT: v_trunc_f32_e32 v2, v2 1700; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 1701; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 1702; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1703; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 1704; VI-NEXT: v_and_b32_e32 v0, 
0x7fffff, v0 1705; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1706; VI-NEXT: s_endpgm 1707; 1708; GCN-LABEL: v_udiv_i23: 1709; GCN: ; %bb.0: 1710; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1711; GCN-NEXT: s_waitcnt lgkmcnt(0) 1712; GCN-NEXT: s_add_u32 s4, s2, 4 1713; GCN-NEXT: s_addc_u32 s5, s3, 0 1714; GCN-NEXT: s_add_u32 s6, s2, 2 1715; GCN-NEXT: s_addc_u32 s7, s3, 0 1716; GCN-NEXT: v_mov_b32_e32 v0, s6 1717; GCN-NEXT: v_mov_b32_e32 v1, s7 1718; GCN-NEXT: s_add_u32 s6, s2, 6 1719; GCN-NEXT: s_addc_u32 s7, s3, 0 1720; GCN-NEXT: v_mov_b32_e32 v2, s6 1721; GCN-NEXT: v_mov_b32_e32 v3, s7 1722; GCN-NEXT: v_mov_b32_e32 v4, s4 1723; GCN-NEXT: v_mov_b32_e32 v5, s5 1724; GCN-NEXT: flat_load_ubyte v6, v[2:3] 1725; GCN-NEXT: flat_load_ushort v4, v[4:5] 1726; GCN-NEXT: v_mov_b32_e32 v2, s2 1727; GCN-NEXT: v_mov_b32_e32 v3, s3 1728; GCN-NEXT: flat_load_ubyte v0, v[0:1] 1729; GCN-NEXT: flat_load_ushort v1, v[2:3] 1730; GCN-NEXT: s_waitcnt vmcnt(3) 1731; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 1732; GCN-NEXT: s_waitcnt vmcnt(2) 1733; GCN-NEXT: v_or_b32_e32 v2, v4, v2 1734; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 1735; GCN-NEXT: s_waitcnt vmcnt(1) 1736; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1737; GCN-NEXT: s_waitcnt vmcnt(0) 1738; GCN-NEXT: v_or_b32_e32 v0, v1, v0 1739; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0 1740; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 1741; GCN-NEXT: v_mov_b32_e32 v0, s0 1742; GCN-NEXT: v_mov_b32_e32 v1, s1 1743; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 1744; GCN-NEXT: v_trunc_f32_e32 v4, v4 1745; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4 1746; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 1747; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 1748; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 1749; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 1750; GCN-NEXT: flat_store_dword v[0:1], v2 1751; GCN-NEXT: s_endpgm 1752; 1753; GFX1030-LABEL: v_udiv_i23: 1754; GFX1030: ; %bb.0: 1755; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1756; GFX1030-NEXT: v_mov_b32_e32 v0, 0 1757; GFX1030-NEXT: s_waitcnt 
lgkmcnt(0) 1758; GFX1030-NEXT: s_clause 0x3 1759; GFX1030-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6 1760; GFX1030-NEXT: global_load_ushort v2, v0, s[2:3] offset:4 1761; GFX1030-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2 1762; GFX1030-NEXT: global_load_ushort v4, v0, s[2:3] 1763; GFX1030-NEXT: s_waitcnt vmcnt(3) 1764; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1765; GFX1030-NEXT: s_waitcnt vmcnt(2) 1766; GFX1030-NEXT: v_or_b32_e32 v1, v2, v1 1767; GFX1030-NEXT: s_waitcnt vmcnt(1) 1768; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 16, v3 1769; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, v1 1770; GFX1030-NEXT: s_waitcnt vmcnt(0) 1771; GFX1030-NEXT: v_or_b32_e32 v2, v4, v2 1772; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v1 1773; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, v2 1774; GFX1030-NEXT: v_mul_f32_e32 v3, v2, v3 1775; GFX1030-NEXT: v_trunc_f32_e32 v3, v3 1776; GFX1030-NEXT: v_fma_f32 v2, -v3, v1, v2 1777; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3 1778; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v1 1779; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo 1780; GFX1030-NEXT: v_and_b32_e32 v1, 0x7fffff, v1 1781; GFX1030-NEXT: global_store_dword v0, v1, s[0:1] 1782; GFX1030-NEXT: s_endpgm 1783; 1784; EG-LABEL: v_udiv_i23: 1785; EG: ; %bb.0: 1786; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 1787; EG-NEXT: TEX 3 @6 1788; EG-NEXT: ALU 20, @15, KC0[CB0:0-32], KC1[] 1789; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1790; EG-NEXT: CF_END 1791; EG-NEXT: PAD 1792; EG-NEXT: Fetch clause starting at 6: 1793; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1 1794; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1 1795; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1 1796; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 1797; EG-NEXT: ALU clause starting at 14: 1798; EG-NEXT: MOV * T0.X, KC0[2].Z, 1799; EG-NEXT: ALU clause starting at 15: 1800; EG-NEXT: LSHL * T0.W, T1.X, literal.x, 1801; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1802; EG-NEXT: OR_INT T0.W, T0.X, PV.W, 1803; EG-NEXT: LSHL * T1.W, T3.X, literal.x, 1804; 
; Documentation for @v_udiv_i24, whose `define` sits partway through the line
; below. The kernel loads two i24 values from consecutive elements of %in,
; divides them with udiv, zero-extends the quotient to i32 and stores it to
; %out. The assertions expect the integer expansion rather than a
; float-reciprocal shortcut: a reciprocal estimate (scaled by 0x4f7ffffe on the
; amdgcn runs, RECIP_UINT on the r600 run), a mul_lo/mul_hi refinement, and two
; compare-and-select correction steps. The amdgcn runs then mask the quotient
; with 0xffffff before the store.
EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1805; EG-NEXT: UINT_TO_FLT * T0.X, PV.W, 1806; EG-NEXT: OR_INT T0.W, T2.X, T1.W, 1807; EG-NEXT: RECIP_IEEE * T0.Y, PS, 1808; EG-NEXT: UINT_TO_FLT * T0.Z, PV.W, 1809; EG-NEXT: MUL_IEEE * T0.W, PS, T0.Y, 1810; EG-NEXT: TRUNC * T0.W, PV.W, 1811; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.X, T0.Z, 1812; EG-NEXT: TRUNC * T0.W, PV.W, 1813; EG-NEXT: SETGE * T1.W, |PV.W|, |T0.X|, 1814; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x, 1815; EG-NEXT: FLT_TO_UINT * T0.X, T0.W, 1816; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 1817; EG-NEXT: ADD_INT * T0.W, PS, PV.W, 1818; EG-NEXT: AND_INT T0.X, PV.W, literal.x, 1819; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1820; EG-NEXT: 8388607(1.175494e-38), 2(2.802597e-45) 1821 %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1 1822 %num = load i23, i23 addrspace(1) * %in 1823 %den = load i23, i23 addrspace(1) * %den_ptr 1824 %result = udiv i23 %num, %den 1825 %result.ext = zext i23 %result to i32 1826 store i32 %result.ext, i32 addrspace(1)* %out 1827 ret void 1828} 1829 1830define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { 1831; SI-LABEL: v_udiv_i24: 1832; SI: ; %bb.0: 1833; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1834; SI-NEXT: s_mov_b32 s7, 0xf000 1835; SI-NEXT: s_mov_b32 s6, -1 1836; SI-NEXT: s_mov_b32 s10, s6 1837; SI-NEXT: s_mov_b32 s11, s7 1838; SI-NEXT: s_waitcnt lgkmcnt(0) 1839; SI-NEXT: s_mov_b32 s8, s2 1840; SI-NEXT: s_mov_b32 s9, s3 1841; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 1842; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 1843; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 1844; SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 1845; SI-NEXT: s_mov_b32 s4, s0 1846; SI-NEXT: s_mov_b32 s5, s1 1847; SI-NEXT: s_waitcnt vmcnt(3) 1848; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1849; SI-NEXT: s_waitcnt vmcnt(2) 1850; SI-NEXT: v_or_b32_e32 v0, v1, v0 1851; SI-NEXT: v_cvt_f32_u32_e32 v1, v0 1852; SI-NEXT: v_sub_i32_e32 
v4, vcc, 0, v0 1853; SI-NEXT: s_waitcnt vmcnt(1) 1854; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1855; SI-NEXT: v_rcp_iflag_f32_e32 v1, v1 1856; SI-NEXT: s_waitcnt vmcnt(0) 1857; SI-NEXT: v_or_b32_e32 v2, v3, v2 1858; SI-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1859; SI-NEXT: v_cvt_u32_f32_e32 v1, v1 1860; SI-NEXT: v_mul_lo_u32 v4, v4, v1 1861; SI-NEXT: v_mul_hi_u32 v4, v1, v4 1862; SI-NEXT: v_add_i32_e32 v1, vcc, v4, v1 1863; SI-NEXT: v_mul_hi_u32 v1, v2, v1 1864; SI-NEXT: v_mul_lo_u32 v3, v1, v0 1865; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1866; SI-NEXT: v_subrev_i32_e32 v2, vcc, v3, v2 1867; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v0 1868; SI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1869; SI-NEXT: v_subrev_i32_e32 v3, vcc, v0, v2 1870; SI-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 1871; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v1 1872; SI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 1873; SI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc 1874; SI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 1875; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1876; SI-NEXT: s_endpgm 1877; 1878; VI-LABEL: v_udiv_i24: 1879; VI: ; %bb.0: 1880; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1881; VI-NEXT: s_mov_b32 s7, 0xf000 1882; VI-NEXT: s_mov_b32 s6, -1 1883; VI-NEXT: s_mov_b32 s10, s6 1884; VI-NEXT: s_mov_b32 s11, s7 1885; VI-NEXT: s_waitcnt lgkmcnt(0) 1886; VI-NEXT: s_mov_b32 s8, s2 1887; VI-NEXT: s_mov_b32 s9, s3 1888; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 1889; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 1890; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 1891; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 1892; VI-NEXT: s_mov_b32 s4, s0 1893; VI-NEXT: s_mov_b32 s5, s1 1894; VI-NEXT: s_waitcnt vmcnt(3) 1895; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1896; VI-NEXT: s_waitcnt vmcnt(2) 1897; VI-NEXT: v_or_b32_e32 v0, v1, v0 1898; VI-NEXT: v_cvt_f32_u32_e32 v1, v0 1899; VI-NEXT: v_sub_u32_e32 v4, vcc, 0, v0 1900; VI-NEXT: s_waitcnt vmcnt(1) 1901; VI-NEXT: v_lshlrev_b32_e32 v2, 
16, v2 1902; VI-NEXT: v_rcp_iflag_f32_e32 v1, v1 1903; VI-NEXT: s_waitcnt vmcnt(0) 1904; VI-NEXT: v_or_b32_e32 v2, v3, v2 1905; VI-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1906; VI-NEXT: v_cvt_u32_f32_e32 v1, v1 1907; VI-NEXT: v_mul_lo_u32 v4, v4, v1 1908; VI-NEXT: v_mul_hi_u32 v4, v1, v4 1909; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v1 1910; VI-NEXT: v_mul_hi_u32 v1, v2, v1 1911; VI-NEXT: v_mul_lo_u32 v3, v1, v0 1912; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v1 1913; VI-NEXT: v_subrev_u32_e32 v2, vcc, v3, v2 1914; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v0 1915; VI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1916; VI-NEXT: v_subrev_u32_e32 v3, vcc, v0, v2 1917; VI-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 1918; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v1 1919; VI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 1920; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc 1921; VI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 1922; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1923; VI-NEXT: s_endpgm 1924; 1925; GCN-LABEL: v_udiv_i24: 1926; GCN: ; %bb.0: 1927; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1928; GCN-NEXT: s_waitcnt lgkmcnt(0) 1929; GCN-NEXT: s_add_u32 s4, s2, 4 1930; GCN-NEXT: s_addc_u32 s5, s3, 0 1931; GCN-NEXT: s_add_u32 s6, s2, 2 1932; GCN-NEXT: v_mov_b32_e32 v0, s4 1933; GCN-NEXT: s_addc_u32 s7, s3, 0 1934; GCN-NEXT: v_mov_b32_e32 v1, s5 1935; GCN-NEXT: s_add_u32 s4, s2, 6 1936; GCN-NEXT: s_addc_u32 s5, s3, 0 1937; GCN-NEXT: v_mov_b32_e32 v2, s4 1938; GCN-NEXT: v_mov_b32_e32 v3, s5 1939; GCN-NEXT: flat_load_ubyte v4, v[2:3] 1940; GCN-NEXT: flat_load_ushort v5, v[0:1] 1941; GCN-NEXT: v_mov_b32_e32 v2, s6 1942; GCN-NEXT: v_mov_b32_e32 v0, s2 1943; GCN-NEXT: v_mov_b32_e32 v3, s7 1944; GCN-NEXT: v_mov_b32_e32 v1, s3 1945; GCN-NEXT: flat_load_ubyte v2, v[2:3] 1946; GCN-NEXT: flat_load_ushort v0, v[0:1] 1947; GCN-NEXT: s_waitcnt vmcnt(3) 1948; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 1949; GCN-NEXT: s_waitcnt vmcnt(2) 1950; GCN-NEXT: v_or_b32_e32 v3, v5, v1 1951; GCN-NEXT: v_cvt_f32_u32_e32 v1, v3 1952; 
GCN-NEXT: v_sub_u32_e32 v4, vcc, 0, v3 1953; GCN-NEXT: s_waitcnt vmcnt(1) 1954; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1955; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 1956; GCN-NEXT: s_waitcnt vmcnt(0) 1957; GCN-NEXT: v_or_b32_e32 v2, v0, v2 1958; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1959; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1960; GCN-NEXT: v_mul_lo_u32 v4, v4, v1 1961; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 1962; GCN-NEXT: v_add_u32_e32 v0, vcc, v4, v1 1963; GCN-NEXT: v_mul_hi_u32 v4, v2, v0 1964; GCN-NEXT: v_mov_b32_e32 v0, s0 1965; GCN-NEXT: v_mov_b32_e32 v1, s1 1966; GCN-NEXT: v_mul_lo_u32 v5, v4, v3 1967; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4 1968; GCN-NEXT: v_subrev_u32_e32 v2, vcc, v5, v2 1969; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v3 1970; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] 1971; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v3, v2 1972; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 1973; GCN-NEXT: v_add_u32_e32 v5, vcc, 1, v4 1974; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 1975; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc 1976; GCN-NEXT: v_and_b32_e32 v2, 0xffffff, v2 1977; GCN-NEXT: flat_store_dword v[0:1], v2 1978; GCN-NEXT: s_endpgm 1979; 1980; GFX1030-LABEL: v_udiv_i24: 1981; GFX1030: ; %bb.0: 1982; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1983; GFX1030-NEXT: v_mov_b32_e32 v0, 0 1984; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 1985; GFX1030-NEXT: s_clause 0x3 1986; GFX1030-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6 1987; GFX1030-NEXT: global_load_ushort v2, v0, s[2:3] offset:4 1988; GFX1030-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2 1989; GFX1030-NEXT: global_load_ushort v4, v0, s[2:3] 1990; GFX1030-NEXT: s_waitcnt vmcnt(3) 1991; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1992; GFX1030-NEXT: s_waitcnt vmcnt(1) 1993; GFX1030-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1994; GFX1030-NEXT: v_or_b32_e32 v1, v2, v1 1995; GFX1030-NEXT: s_waitcnt vmcnt(0) 1996; GFX1030-NEXT: v_or_b32_e32 v3, v4, v3 1997; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, v1 1998; 
GFX1030-NEXT: v_sub_nc_u32_e32 v5, 0, v1 1999; GFX1030-NEXT: v_rcp_iflag_f32_e32 v2, v2 2000; GFX1030-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 2001; GFX1030-NEXT: v_cvt_u32_f32_e32 v2, v2 2002; GFX1030-NEXT: v_mul_lo_u32 v5, v5, v2 2003; GFX1030-NEXT: v_mul_hi_u32 v5, v2, v5 2004; GFX1030-NEXT: v_add_nc_u32_e32 v2, v2, v5 2005; GFX1030-NEXT: v_mul_hi_u32 v2, v3, v2 2006; GFX1030-NEXT: v_mul_lo_u32 v4, v2, v1 2007; GFX1030-NEXT: v_sub_nc_u32_e32 v3, v3, v4 2008; GFX1030-NEXT: v_add_nc_u32_e32 v4, 1, v2 2009; GFX1030-NEXT: v_sub_nc_u32_e32 v5, v3, v1 2010; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v3, v1 2011; GFX1030-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo 2012; GFX1030-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo 2013; GFX1030-NEXT: v_add_nc_u32_e32 v4, 1, v2 2014; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v3, v1 2015; GFX1030-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo 2016; GFX1030-NEXT: v_and_b32_e32 v1, 0xffffff, v1 2017; GFX1030-NEXT: global_store_dword v0, v1, s[0:1] 2018; GFX1030-NEXT: s_endpgm 2019; 2020; EG-LABEL: v_udiv_i24: 2021; EG: ; %bb.0: 2022; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 2023; EG-NEXT: TEX 3 @6 2024; EG-NEXT: ALU 23, @15, KC0[CB0:0-32], KC1[] 2025; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 2026; EG-NEXT: CF_END 2027; EG-NEXT: PAD 2028; EG-NEXT: Fetch clause starting at 6: 2029; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1 2030; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1 2031; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1 2032; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 2033; EG-NEXT: ALU clause starting at 14: 2034; EG-NEXT: MOV * T0.X, KC0[2].Z, 2035; EG-NEXT: ALU clause starting at 15: 2036; EG-NEXT: LSHL * T0.W, T1.X, literal.x, 2037; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2038; EG-NEXT: OR_INT * T0.W, T0.X, PV.W, 2039; EG-NEXT: SUB_INT T1.W, 0.0, PV.W, 2040; EG-NEXT: RECIP_UINT * T0.X, PV.W, 2041; EG-NEXT: MULLO_INT * T0.Y, PV.W, PS, 2042; EG-NEXT: LSHL T1.W, T3.X, literal.x, 2043; EG-NEXT: MULHI * T0.Y, T0.X, PS, 2044; EG-NEXT: 16(2.242078e-44), 
; Documentation for @scalarize_mulhu_4xi32, whose `define` sits partway through
; the line below. The kernel loads a <4 x i32> from %in, divides it with udiv
; by the splat constant 53668 and stores the result to %out. The assertions
; expect no division sequence at all: each lane is shifted right by 2,
; multiplied-high by the magic constant 0x1389c755 (printed as 327796565 in the
; r600 output) and then shifted right by 10.
0(0.000000e+00) 2045; EG-NEXT: ADD_INT T2.W, T0.X, PS, 2046; EG-NEXT: OR_INT * T1.W, T2.X, PV.W, 2047; EG-NEXT: MULHI * T0.X, PS, PV.W, 2048; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, 2049; EG-NEXT: SUB_INT * T1.W, T1.W, PS, 2050; EG-NEXT: ADD_INT T0.Z, T0.X, 1, 2051; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, 2052; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W, 2053; EG-NEXT: CNDE_INT T1.W, PV.W, T1.W, PS, 2054; EG-NEXT: CNDE_INT * T2.W, PV.W, T0.X, PV.Z, 2055; EG-NEXT: ADD_INT T3.W, PS, 1, 2056; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W, 2057; EG-NEXT: CNDE_INT T0.X, PS, T2.W, PV.W, 2058; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2059; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2060 %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1 2061 %num = load i24, i24 addrspace(1) * %in 2062 %den = load i24, i24 addrspace(1) * %den_ptr 2063 %result = udiv i24 %num, %den 2064 %result.ext = zext i24 %result to i32 2065 store i32 %result.ext, i32 addrspace(1)* %out 2066 ret void 2067} 2068 2069define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { 2070; SI-LABEL: scalarize_mulhu_4xi32: 2071; SI: ; %bb.0: 2072; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2073; SI-NEXT: s_mov_b32 s7, 0xf000 2074; SI-NEXT: s_mov_b32 s6, -1 2075; SI-NEXT: s_waitcnt lgkmcnt(0) 2076; SI-NEXT: s_mov_b32 s4, s0 2077; SI-NEXT: s_mov_b32 s5, s1 2078; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 2079; SI-NEXT: s_mov_b32 s0, 0x1389c755 2080; SI-NEXT: s_mov_b32 s4, s2 2081; SI-NEXT: s_mov_b32 s5, s3 2082; SI-NEXT: s_waitcnt vmcnt(0) 2083; SI-NEXT: v_lshrrev_b32_e32 v0, 2, v0 2084; SI-NEXT: v_lshrrev_b32_e32 v1, 2, v1 2085; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 2086; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 2087; SI-NEXT: v_mul_hi_u32 v0, v0, s0 2088; SI-NEXT: v_mul_hi_u32 v1, v1, s0 2089; SI-NEXT: v_mul_hi_u32 v2, v2, s0 2090; SI-NEXT: v_mul_hi_u32 v3, v3, s0 2091; SI-NEXT: v_lshrrev_b32_e32 v0, 10, v0 2092; SI-NEXT: v_lshrrev_b32_e32 
v1, 10, v1 2093; SI-NEXT: v_lshrrev_b32_e32 v2, 10, v2 2094; SI-NEXT: v_lshrrev_b32_e32 v3, 10, v3 2095; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2096; SI-NEXT: s_endpgm 2097; 2098; VI-LABEL: scalarize_mulhu_4xi32: 2099; VI: ; %bb.0: 2100; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2101; VI-NEXT: s_mov_b32 s7, 0xf000 2102; VI-NEXT: s_mov_b32 s6, -1 2103; VI-NEXT: s_waitcnt lgkmcnt(0) 2104; VI-NEXT: s_mov_b32 s4, s0 2105; VI-NEXT: s_mov_b32 s5, s1 2106; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 2107; VI-NEXT: s_mov_b32 s0, 0x1389c755 2108; VI-NEXT: s_mov_b32 s4, s2 2109; VI-NEXT: s_mov_b32 s5, s3 2110; VI-NEXT: s_waitcnt vmcnt(0) 2111; VI-NEXT: v_lshrrev_b32_e32 v0, 2, v0 2112; VI-NEXT: v_lshrrev_b32_e32 v1, 2, v1 2113; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 2114; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 2115; VI-NEXT: v_mul_hi_u32 v0, v0, s0 2116; VI-NEXT: v_mul_hi_u32 v1, v1, s0 2117; VI-NEXT: v_mul_hi_u32 v2, v2, s0 2118; VI-NEXT: v_mul_hi_u32 v3, v3, s0 2119; VI-NEXT: v_lshrrev_b32_e32 v0, 10, v0 2120; VI-NEXT: v_lshrrev_b32_e32 v1, 10, v1 2121; VI-NEXT: v_lshrrev_b32_e32 v2, 10, v2 2122; VI-NEXT: v_lshrrev_b32_e32 v3, 10, v3 2123; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2124; VI-NEXT: s_endpgm 2125; 2126; GCN-LABEL: scalarize_mulhu_4xi32: 2127; GCN: ; %bb.0: 2128; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2129; GCN-NEXT: s_waitcnt lgkmcnt(0) 2130; GCN-NEXT: v_mov_b32_e32 v0, s0 2131; GCN-NEXT: v_mov_b32_e32 v1, s1 2132; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2133; GCN-NEXT: s_mov_b32 s0, 0x1389c755 2134; GCN-NEXT: v_mov_b32_e32 v4, s2 2135; GCN-NEXT: v_mov_b32_e32 v5, s3 2136; GCN-NEXT: s_waitcnt vmcnt(0) 2137; GCN-NEXT: v_lshrrev_b32_e32 v0, 2, v0 2138; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v1 2139; GCN-NEXT: v_lshrrev_b32_e32 v2, 2, v2 2140; GCN-NEXT: v_lshrrev_b32_e32 v3, 2, v3 2141; GCN-NEXT: v_mul_hi_u32 v0, v0, s0 2142; GCN-NEXT: v_mul_hi_u32 v1, v1, s0 2143; GCN-NEXT: v_mul_hi_u32 v2, v2, s0 2144; GCN-NEXT: v_mul_hi_u32 
v3, v3, s0 2145; GCN-NEXT: v_lshrrev_b32_e32 v0, 10, v0 2146; GCN-NEXT: v_lshrrev_b32_e32 v1, 10, v1 2147; GCN-NEXT: v_lshrrev_b32_e32 v2, 10, v2 2148; GCN-NEXT: v_lshrrev_b32_e32 v3, 10, v3 2149; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2150; GCN-NEXT: s_endpgm 2151; 2152; GFX1030-LABEL: scalarize_mulhu_4xi32: 2153; GFX1030: ; %bb.0: 2154; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2155; GFX1030-NEXT: v_mov_b32_e32 v4, 0 2156; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 2157; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] 2158; GFX1030-NEXT: s_mov_b32 s0, 0x1389c755 2159; GFX1030-NEXT: s_waitcnt vmcnt(0) 2160; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 2, v0 2161; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 2, v1 2162; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 2, v2 2163; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 2, v3 2164; GFX1030-NEXT: v_mul_hi_u32 v0, v0, s0 2165; GFX1030-NEXT: v_mul_hi_u32 v1, v1, s0 2166; GFX1030-NEXT: v_mul_hi_u32 v2, v2, s0 2167; GFX1030-NEXT: v_mul_hi_u32 v3, v3, s0 2168; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 10, v0 2169; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 10, v1 2170; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 10, v2 2171; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 10, v3 2172; GFX1030-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 2173; GFX1030-NEXT: s_endpgm 2174; 2175; EG-LABEL: scalarize_mulhu_4xi32: 2176; EG: ; %bb.0: 2177; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 2178; EG-NEXT: TEX 0 @6 2179; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[] 2180; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2181; EG-NEXT: CF_END 2182; EG-NEXT: PAD 2183; EG-NEXT: Fetch clause starting at 6: 2184; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 2185; EG-NEXT: ALU clause starting at 8: 2186; EG-NEXT: MOV * T0.X, KC0[2].Y, 2187; EG-NEXT: ALU clause starting at 9: 2188; EG-NEXT: LSHR T0.W, T0.W, literal.x, 2189; EG-NEXT: LSHR * T1.W, T0.Z, literal.x, 2190; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2191; EG-NEXT: MULHI * T0.Z, PV.W, literal.x, 2192; EG-NEXT: 327796565(3.478022e-27), 
; Documentation for @test_udiv2, whose `define` sits partway through the line
; below. The kernel computes udiv i32 %p, 2 and writes the quotient with a
; volatile store through an undef address-space-1 pointer, so only the
; arithmetic is of interest. The assertions expect the division to become a
; single logical shift right by 1 (s_lshr_b32 on the amdgcn runs, LSHR on the
; r600 run).
0(0.000000e+00) 2193; EG-NEXT: LSHR T1.Z, T0.Y, literal.x, 2194; EG-NEXT: LSHR T0.W, PS, literal.y, 2195; EG-NEXT: MULHI * T0.Y, T1.W, literal.z, 2196; EG-NEXT: 2(2.802597e-45), 10(1.401298e-44) 2197; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00) 2198; EG-NEXT: LSHR T0.Z, PS, literal.x, 2199; EG-NEXT: LSHR T1.W, T0.X, literal.y, 2200; EG-NEXT: MULHI * T0.X, PV.Z, literal.z, 2201; EG-NEXT: 10(1.401298e-44), 2(2.802597e-45) 2202; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00) 2203; EG-NEXT: LSHR T0.Y, PS, literal.x, 2204; EG-NEXT: MULHI * T0.X, PV.W, literal.y, 2205; EG-NEXT: 10(1.401298e-44), 327796565(3.478022e-27) 2206; EG-NEXT: LSHR T0.X, PS, literal.x, 2207; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.y, 2208; EG-NEXT: 10(1.401298e-44), 2(2.802597e-45) 2209 %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 2210 %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668> 2211 store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16 2212 ret void 2213} 2214 2215define amdgpu_kernel void @test_udiv2(i32 %p) { 2216; SI-LABEL: test_udiv2: 2217; SI: ; %bb.0: 2218; SI-NEXT: s_load_dword s0, s[0:1], 0x9 2219; SI-NEXT: s_mov_b32 s3, 0xf000 2220; SI-NEXT: s_mov_b32 s2, -1 2221; SI-NEXT: s_waitcnt lgkmcnt(0) 2222; SI-NEXT: s_lshr_b32 s0, s0, 1 2223; SI-NEXT: v_mov_b32_e32 v0, s0 2224; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2225; SI-NEXT: s_waitcnt vmcnt(0) 2226; SI-NEXT: s_endpgm 2227; 2228; VI-LABEL: test_udiv2: 2229; VI: ; %bb.0: 2230; VI-NEXT: s_load_dword s0, s[0:1], 0x24 2231; VI-NEXT: s_mov_b32 s3, 0xf000 2232; VI-NEXT: s_mov_b32 s2, -1 2233; VI-NEXT: s_waitcnt lgkmcnt(0) 2234; VI-NEXT: s_lshr_b32 s0, s0, 1 2235; VI-NEXT: v_mov_b32_e32 v0, s0 2236; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2237; VI-NEXT: s_waitcnt vmcnt(0) 2238; VI-NEXT: s_endpgm 2239; 2240; GCN-LABEL: test_udiv2: 2241; GCN: ; %bb.0: 2242; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 2243; GCN-NEXT: s_waitcnt lgkmcnt(0) 2244; GCN-NEXT: s_lshr_b32 s0, s0, 1 2245; 
; Documentation for @test_udiv_3_mulhu, whose `define` sits partway through the
; line below. The kernel computes udiv i32 %p, 3 and writes the quotient with a
; volatile store through an undef address-space-1 pointer. The assertions
; expect the division to become a multiply-high by 0xaaaaaaab (printed as the
; signed value -1431655765 in the r600 output) followed by a logical shift
; right by 1. The gfx1030 run keeps the multiply on the scalar unit
; (s_mul_hi_u32); the other amdgcn runs use v_mul_hi_u32.
GCN-NEXT: v_mov_b32_e32 v0, s0 2246; GCN-NEXT: flat_store_dword v[0:1], v0 2247; GCN-NEXT: s_waitcnt vmcnt(0) 2248; GCN-NEXT: s_endpgm 2249; 2250; GFX1030-LABEL: test_udiv2: 2251; GFX1030: ; %bb.0: 2252; GFX1030-NEXT: s_load_dword s0, s[4:5], 0x0 2253; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 2254; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 2255; GFX1030-NEXT: v_mov_b32_e32 v0, s0 2256; GFX1030-NEXT: global_store_dword v[0:1], v0, off 2257; GFX1030-NEXT: s_waitcnt_vscnt null, 0x0 2258; GFX1030-NEXT: s_endpgm 2259; 2260; EG-LABEL: test_udiv2: 2261; EG: ; %bb.0: 2262; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 2263; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 2264; EG-NEXT: CF_END 2265; EG-NEXT: PAD 2266; EG-NEXT: ALU clause starting at 4: 2267; EG-NEXT: MOV T0.X, literal.x, 2268; EG-NEXT: LSHR * T1.X, KC0[2].Y, 1, 2269; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 2270 %i = udiv i32 %p, 2 2271 store volatile i32 %i, i32 addrspace(1)* undef 2272 ret void 2273} 2274 2275define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { 2276; SI-LABEL: test_udiv_3_mulhu: 2277; SI: ; %bb.0: 2278; SI-NEXT: s_load_dword s0, s[0:1], 0x9 2279; SI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab 2280; SI-NEXT: s_mov_b32 s3, 0xf000 2281; SI-NEXT: s_mov_b32 s2, -1 2282; SI-NEXT: s_waitcnt lgkmcnt(0) 2283; SI-NEXT: v_mul_hi_u32 v0, s0, v0 2284; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 2285; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2286; SI-NEXT: s_waitcnt vmcnt(0) 2287; SI-NEXT: s_endpgm 2288; 2289; VI-LABEL: test_udiv_3_mulhu: 2290; VI: ; %bb.0: 2291; VI-NEXT: s_load_dword s0, s[0:1], 0x24 2292; VI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab 2293; VI-NEXT: s_mov_b32 s3, 0xf000 2294; VI-NEXT: s_mov_b32 s2, -1 2295; VI-NEXT: s_waitcnt lgkmcnt(0) 2296; VI-NEXT: v_mul_hi_u32 v0, s0, v0 2297; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 2298; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2299; VI-NEXT: s_waitcnt vmcnt(0) 2300; VI-NEXT: s_endpgm 2301; 2302; GCN-LABEL: test_udiv_3_mulhu: 2303; GCN: ; %bb.0: 2304; GCN-NEXT: 
s_load_dword s0, s[4:5], 0x0 2305; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab 2306; GCN-NEXT: s_waitcnt lgkmcnt(0) 2307; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 2308; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0 2309; GCN-NEXT: flat_store_dword v[0:1], v0 2310; GCN-NEXT: s_waitcnt vmcnt(0) 2311; GCN-NEXT: s_endpgm 2312; 2313; GFX1030-LABEL: test_udiv_3_mulhu: 2314; GFX1030: ; %bb.0: 2315; GFX1030-NEXT: s_load_dword s0, s[4:5], 0x0 2316; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 2317; GFX1030-NEXT: s_mul_hi_u32 s0, s0, 0xaaaaaaab 2318; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 2319; GFX1030-NEXT: v_mov_b32_e32 v0, s0 2320; GFX1030-NEXT: global_store_dword v[0:1], v0, off 2321; GFX1030-NEXT: s_waitcnt_vscnt null, 0x0 2322; GFX1030-NEXT: s_endpgm 2323; 2324; EG-LABEL: test_udiv_3_mulhu: 2325; EG: ; %bb.0: 2326; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] 2327; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 2328; EG-NEXT: CF_END 2329; EG-NEXT: PAD 2330; EG-NEXT: ALU clause starting at 4: 2331; EG-NEXT: MULHI * T0.X, KC0[2].Y, literal.x, 2332; EG-NEXT: -1431655765(-3.031649e-13), 0(0.000000e+00) 2333; EG-NEXT: LSHR T0.X, PS, 1, 2334; EG-NEXT: MOV * T1.X, literal.x, 2335; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 2336 %i = udiv i32 %p, 3 2337 store volatile i32 %i, i32 addrspace(1)* undef 2338 ret void 2339} 2340 2341define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) { 2342; SI-LABEL: fdiv_test_denormals: 2343; SI: ; %bb.0: ; %bb 2344; SI-NEXT: s_mov_b32 s0, 0 2345; SI-NEXT: s_mov_b32 s3, 0xf000 2346; SI-NEXT: s_mov_b32 s2, -1 2347; SI-NEXT: s_mov_b32 s1, s0 2348; SI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 2349; SI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0 2350; SI-NEXT: s_waitcnt vmcnt(1) 2351; SI-NEXT: v_cvt_f32_i32_e32 v2, v0 2352; SI-NEXT: s_waitcnt vmcnt(0) 2353; SI-NEXT: v_cvt_f32_i32_e32 v3, v1 2354; SI-NEXT: v_xor_b32_e32 v0, v1, v0 2355; SI-NEXT: v_ashrrev_i32_e32 v0, 30, v0 2356; SI-NEXT: v_rcp_iflag_f32_e32 v4, v2 2357; SI-NEXT: v_or_b32_e32 v0, 1, 
; Documentation for @fdiv_test_denormals, whose `define` sits partway through
; the line below. The function loads two i8 values (one from a null
; address-space-1 pointer, one from %arg plus an undef offset), sign-extends
; both to i32, divides them with sdiv, truncates the quotient to i8 and stores
; it back to null. The assertions expect the float-based expansion: convert
; both operands to f32, reciprocal, multiply, truncate, multiply-add (v_mad_f32
; on the older amdgcn runs, v_fma_f32 on gfx1030), then a correction of +1 or
; -1 derived from the xor of the operands' signs.
; Presumably the purpose is to pin this expansion under the different
; -denormal-fp-math-f32 settings used by the RUN lines at the top of the
; file; confirm against the change that introduced the test.
v0 2358; SI-NEXT: v_mul_f32_e32 v1, v3, v4 2359; SI-NEXT: v_trunc_f32_e32 v1, v1 2360; SI-NEXT: v_mad_f32 v3, -v1, v2, v3 2361; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 2362; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 2363; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 2364; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 2365; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 2366; SI-NEXT: s_endpgm 2367; 2368; VI-LABEL: fdiv_test_denormals: 2369; VI: ; %bb.0: ; %bb 2370; VI-NEXT: s_mov_b32 s0, 0 2371; VI-NEXT: s_mov_b32 s3, 0xf000 2372; VI-NEXT: s_mov_b32 s2, -1 2373; VI-NEXT: s_mov_b32 s1, s0 2374; VI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 2375; VI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0 2376; VI-NEXT: s_waitcnt vmcnt(1) 2377; VI-NEXT: v_cvt_f32_i32_e32 v2, v0 2378; VI-NEXT: s_waitcnt vmcnt(0) 2379; VI-NEXT: v_cvt_f32_i32_e32 v3, v1 2380; VI-NEXT: v_xor_b32_e32 v0, v1, v0 2381; VI-NEXT: v_ashrrev_i32_e32 v0, 30, v0 2382; VI-NEXT: v_rcp_iflag_f32_e32 v4, v2 2383; VI-NEXT: v_or_b32_e32 v0, 1, v0 2384; VI-NEXT: v_mul_f32_e32 v1, v3, v4 2385; VI-NEXT: v_trunc_f32_e32 v1, v1 2386; VI-NEXT: v_mad_f32 v3, -v1, v2, v3 2387; VI-NEXT: v_cvt_i32_f32_e32 v1, v1 2388; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 2389; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 2390; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 2391; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 2392; VI-NEXT: s_endpgm 2393; 2394; GCN-LABEL: fdiv_test_denormals: 2395; GCN: ; %bb.0: ; %bb 2396; GCN-NEXT: flat_load_sbyte v2, v[0:1] 2397; GCN-NEXT: v_mov_b32_e32 v0, 0 2398; GCN-NEXT: v_mov_b32_e32 v1, 0 2399; GCN-NEXT: flat_load_sbyte v3, v[0:1] 2400; GCN-NEXT: s_waitcnt vmcnt(1) 2401; GCN-NEXT: v_cvt_f32_i32_e32 v4, v2 2402; GCN-NEXT: s_waitcnt vmcnt(0) 2403; GCN-NEXT: v_cvt_f32_i32_e32 v5, v3 2404; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 2405; GCN-NEXT: v_xor_b32_e32 v2, v3, v2 2406; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2 2407; GCN-NEXT: v_or_b32_e32 v2, 1, v2 2408; GCN-NEXT: v_mul_f32_e32 v3, v5, v6 2409; GCN-NEXT: v_trunc_f32_e32 v3, v3 
2410; GCN-NEXT: v_mad_f32 v5, -v3, v4, v5 2411; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 2412; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 2413; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc 2414; GCN-NEXT: v_add_u32_e32 v2, vcc, v3, v2 2415; GCN-NEXT: flat_store_byte v[0:1], v2 2416; GCN-NEXT: s_endpgm 2417; 2418; GFX1030-LABEL: fdiv_test_denormals: 2419; GFX1030: ; %bb.0: ; %bb 2420; GFX1030-NEXT: global_load_sbyte v2, v[0:1], off 2421; GFX1030-NEXT: v_mov_b32_e32 v0, 0 2422; GFX1030-NEXT: v_mov_b32_e32 v1, 0 2423; GFX1030-NEXT: global_load_sbyte v3, v[0:1], off 2424; GFX1030-NEXT: s_waitcnt vmcnt(1) 2425; GFX1030-NEXT: v_cvt_f32_i32_e32 v4, v2 2426; GFX1030-NEXT: v_rcp_iflag_f32_e32 v5, v4 2427; GFX1030-NEXT: s_waitcnt vmcnt(0) 2428; GFX1030-NEXT: v_cvt_f32_i32_e32 v6, v3 2429; GFX1030-NEXT: v_xor_b32_e32 v2, v3, v2 2430; GFX1030-NEXT: v_ashrrev_i32_e32 v2, 30, v2 2431; GFX1030-NEXT: v_mul_f32_e32 v5, v6, v5 2432; GFX1030-NEXT: v_or_b32_e32 v2, 1, v2 2433; GFX1030-NEXT: v_trunc_f32_e32 v3, v5 2434; GFX1030-NEXT: v_fma_f32 v5, -v3, v4, v6 2435; GFX1030-NEXT: v_cvt_i32_f32_e32 v3, v3 2436; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v5|, |v4| 2437; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo 2438; GFX1030-NEXT: v_add_nc_u32_e32 v2, v3, v2 2439; GFX1030-NEXT: global_store_byte v[0:1], v2, off 2440; GFX1030-NEXT: s_endpgm 2441; 2442; EG-LABEL: fdiv_test_denormals: 2443; EG: ; %bb.0: ; %bb 2444; EG-NEXT: TEX 0 @6 2445; EG-NEXT: ALU 0, @10, KC0[], KC1[] 2446; EG-NEXT: TEX 0 @8 2447; EG-NEXT: ALU 25, @11, KC0[], KC1[] 2448; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 2449; EG-NEXT: CF_END 2450; EG-NEXT: Fetch clause starting at 6: 2451; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 2452; EG-NEXT: Fetch clause starting at 8: 2453; EG-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1 2454; EG-NEXT: ALU clause starting at 10: 2455; EG-NEXT: MOV * T1.X, 0.0, 2456; EG-NEXT: ALU clause starting at 11: 2457; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, 2458; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2459; 
EG-NEXT: INT_TO_FLT * T0.X, PV.W, 2460; EG-NEXT: BFE_INT T1.W, T1.X, 0.0, literal.x, 2461; EG-NEXT: RECIP_IEEE * T0.Y, PS, 2462; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2463; EG-NEXT: INT_TO_FLT * T0.Z, PV.W, 2464; EG-NEXT: MUL_IEEE * T2.W, PS, T0.Y, 2465; EG-NEXT: TRUNC T2.W, PV.W, 2466; EG-NEXT: XOR_INT * T0.W, T1.W, T0.W, 2467; EG-NEXT: ASHR T0.W, PS, literal.x, 2468; EG-NEXT: MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z, 2469; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) 2470; EG-NEXT: TRUNC T0.Z, T2.W, 2471; EG-NEXT: SETGE T1.W, |PS|, |T0.X|, 2472; EG-NEXT: OR_INT * T0.W, PV.W, 1, 2473; EG-NEXT: CNDE T0.W, PV.W, 0.0, PS, 2474; EG-NEXT: FLT_TO_INT * T1.W, PV.Z, 2475; EG-NEXT: ADD_INT * T0.W, PS, PV.W, 2476; EG-NEXT: AND_INT T0.X, PV.W, literal.x, 2477; EG-NEXT: MOV * T0.W, literal.x, 2478; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2479; EG-NEXT: MOV T0.Y, 0.0, 2480; EG-NEXT: MOV * T0.Z, 0.0, 2481; EG-NEXT: MOV * T1.X, literal.x, 2482; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 2483bb: 2484 %tmp = load i8, i8 addrspace(1)* null, align 1 2485 %tmp1 = sext i8 %tmp to i32 2486 %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef 2487 %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1 2488 %tmp4 = sext i8 %tmp3 to i32 2489 %tmp5 = sdiv i32 %tmp1, %tmp4 2490 %tmp6 = trunc i32 %tmp5 to i8 2491 store i8 %tmp6, i8 addrspace(1)* null, align 1 2492 ret void 2493} 2494 ; Unsigned 64-bit division of %arg by the constant 100000 (0x186a0 in the checks below); the quotient is returned. The SI/VI/GCN/GFX1030/EG assertions that follow are autogenerated: regenerate them with utils/update_llc_test_checks.py instead of editing them by hand. 2495define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { 2496; SI-LABEL: v_test_udiv64_mulhi_fold: 2497; SI: ; %bb.0: 2498; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2499; SI-NEXT: v_mov_b32_e32 v2, 0x4f800000 2500; SI-NEXT: v_madak_f32 v2, 0, v2, 0x47c35000 2501; SI-NEXT: v_rcp_f32_e32 v2, v2 2502; SI-NEXT: s_mov_b32 s4, 0xfffe7960 2503; SI-NEXT: v_mov_b32_e32 v9, 0 2504; SI-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 2505; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 2506; SI-NEXT: v_trunc_f32_e32 v3, v3 2507; SI-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 2508; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 2509; 
SI-NEXT: v_cvt_u32_f32_e32 v3, v3 2510; SI-NEXT: v_mul_hi_u32 v4, v2, s4 2511; SI-NEXT: v_mul_lo_u32 v5, v3, s4 2512; SI-NEXT: v_mul_lo_u32 v6, v2, s4 2513; SI-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 2514; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 2515; SI-NEXT: v_mul_lo_u32 v5, v2, v4 2516; SI-NEXT: v_mul_hi_u32 v7, v2, v6 2517; SI-NEXT: v_mul_hi_u32 v8, v2, v4 2518; SI-NEXT: v_mul_hi_u32 v10, v3, v4 2519; SI-NEXT: v_mul_lo_u32 v4, v3, v4 2520; SI-NEXT: v_add_i32_e32 v5, vcc, v7, v5 2521; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 2522; SI-NEXT: v_mul_lo_u32 v8, v3, v6 2523; SI-NEXT: v_mul_hi_u32 v6, v3, v6 2524; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v8 2525; SI-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc 2526; SI-NEXT: v_addc_u32_e32 v6, vcc, v10, v9, vcc 2527; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 2528; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 2529; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 2530; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc 2531; SI-NEXT: v_mul_hi_u32 v4, v2, s4 2532; SI-NEXT: v_mul_lo_u32 v5, v3, s4 2533; SI-NEXT: v_mul_lo_u32 v6, v2, s4 2534; SI-NEXT: s_mov_b32 s4, 0x186a0 2535; SI-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 2536; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v5 2537; SI-NEXT: v_mul_lo_u32 v5, v2, v4 2538; SI-NEXT: v_mul_hi_u32 v7, v2, v6 2539; SI-NEXT: v_mul_hi_u32 v8, v2, v4 2540; SI-NEXT: v_mul_hi_u32 v10, v3, v4 2541; SI-NEXT: v_mul_lo_u32 v4, v3, v4 2542; SI-NEXT: v_add_i32_e32 v5, vcc, v7, v5 2543; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 2544; SI-NEXT: v_mul_lo_u32 v8, v3, v6 2545; SI-NEXT: v_mul_hi_u32 v6, v3, v6 2546; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v8 2547; SI-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc 2548; SI-NEXT: v_addc_u32_e32 v6, vcc, v10, v9, vcc 2549; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 2550; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 2551; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 2552; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc 2553; SI-NEXT: v_mul_lo_u32 v4, v0, v3 2554; SI-NEXT: v_mul_hi_u32 v5, v0, v2 2555; SI-NEXT: 
v_mul_hi_u32 v6, v0, v3 2556; SI-NEXT: v_mul_hi_u32 v7, v1, v3 2557; SI-NEXT: v_mul_lo_u32 v3, v1, v3 2558; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 2559; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 2560; SI-NEXT: v_mul_lo_u32 v6, v1, v2 2561; SI-NEXT: v_mul_hi_u32 v2, v1, v2 2562; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6 2563; SI-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc 2564; SI-NEXT: v_addc_u32_e32 v4, vcc, v7, v9, vcc 2565; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 2566; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 2567; SI-NEXT: v_mul_lo_u32 v4, v3, s4 2568; SI-NEXT: v_mul_hi_u32 v5, v2, s4 2569; SI-NEXT: v_mul_lo_u32 v6, v2, s4 2570; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 2571; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 2572; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc 2573; SI-NEXT: v_subrev_i32_e32 v4, vcc, s4, v0 2574; SI-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v1, vcc 2575; SI-NEXT: s_mov_b32 s4, 0x1869f 2576; SI-NEXT: v_cmp_lt_u32_e32 vcc, s4, v4 2577; SI-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 2578; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 2579; SI-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 2580; SI-NEXT: v_add_i32_e32 v5, vcc, 2, v2 2581; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v3, vcc 2582; SI-NEXT: v_add_i32_e32 v7, vcc, 1, v2 2583; SI-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0 2584; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc 2585; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] 2586; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 2587; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 2588; SI-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[4:5] 2589; SI-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc 2590; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 2591; SI-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc 2592; SI-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[4:5] 2593; SI-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] 2594; SI-NEXT: s_setpc_b64 s[30:31] 2595; 2596; VI-LABEL: v_test_udiv64_mulhi_fold: 2597; VI: ; %bb.0: 2598; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2599; VI-NEXT: v_mov_b32_e32 v2, 0x4f800000 2600; VI-NEXT: 
v_madak_f32 v2, 0, v2, 0x47c35000 2601; VI-NEXT: v_rcp_f32_e32 v2, v2 2602; VI-NEXT: s_mov_b32 s6, 0xfffe7960 2603; VI-NEXT: v_mov_b32_e32 v9, 0 2604; VI-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 2605; VI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 2606; VI-NEXT: v_trunc_f32_e32 v3, v3 2607; VI-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 2608; VI-NEXT: v_cvt_u32_f32_e32 v6, v2 2609; VI-NEXT: v_cvt_u32_f32_e32 v7, v3 2610; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 2611; VI-NEXT: v_mul_lo_u32 v4, v7, s6 2612; VI-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 2613; VI-NEXT: v_add_u32_e32 v5, vcc, v4, v3 2614; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 2615; VI-NEXT: v_mul_hi_u32 v8, v6, v2 2616; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 2617; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 2618; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc 2619; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0 2620; VI-NEXT: v_add_u32_e32 v2, vcc, v8, v2 2621; VI-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc 2622; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc 2623; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 2624; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2625; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 2626; VI-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc 2627; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 2628; VI-NEXT: v_mul_lo_u32 v4, v7, s6 2629; VI-NEXT: s_mov_b32 s6, 0x186a0 2630; VI-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 2631; VI-NEXT: v_add_u32_e32 v5, vcc, v3, v4 2632; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 2633; VI-NEXT: v_mul_hi_u32 v8, v6, v2 2634; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 2635; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 2636; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc 2637; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0 2638; VI-NEXT: v_add_u32_e32 v2, vcc, v8, v2 2639; VI-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc 2640; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc 2641; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 2642; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 
2643; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v2 2644; VI-NEXT: v_addc_u32_e32 v5, vcc, v7, v3, vcc 2645; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v5, 0 2646; VI-NEXT: v_mul_hi_u32 v6, v0, v4 2647; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 2648; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc 2649; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v4, 0 2650; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0 2651; VI-NEXT: v_add_u32_e32 v2, vcc, v6, v2 2652; VI-NEXT: v_addc_u32_e32 v2, vcc, v7, v3, vcc 2653; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc 2654; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v4 2655; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc 2656; VI-NEXT: v_mul_lo_u32 v6, v5, s6 2657; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s6, 0 2658; VI-NEXT: s_mov_b32 s4, 0x1869f 2659; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v6 2660; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 2661; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc 2662; VI-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0 2663; VI-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc 2664; VI-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 2665; VI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc 2666; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2667; VI-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc 2668; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v4 2669; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc 2670; VI-NEXT: v_add_u32_e32 v7, vcc, 1, v4 2671; VI-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0 2672; VI-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc 2673; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] 2674; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 2675; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 2676; VI-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[4:5] 2677; VI-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc 2678; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 2679; VI-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc 2680; VI-NEXT: v_cndmask_b32_e64 v0, v4, v2, s[4:5] 2681; VI-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] 2682; VI-NEXT: s_setpc_b64 s[30:31] 2683; 2684; GCN-LABEL: v_test_udiv64_mulhi_fold: 2685; GCN: ; %bb.0: 2686; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 2687; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000 2688; GCN-NEXT: v_madak_f32 v2, 0, v2, 0x47c35000 2689; GCN-NEXT: v_rcp_f32_e32 v2, v2 2690; GCN-NEXT: s_mov_b32 s6, 0xfffe7960 2691; GCN-NEXT: v_mov_b32_e32 v9, 0 2692; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 2693; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 2694; GCN-NEXT: v_trunc_f32_e32 v3, v3 2695; GCN-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 2696; GCN-NEXT: v_cvt_u32_f32_e32 v6, v2 2697; GCN-NEXT: v_cvt_u32_f32_e32 v7, v3 2698; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 2699; GCN-NEXT: v_mul_lo_u32 v4, v7, s6 2700; GCN-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 2701; GCN-NEXT: v_add_u32_e32 v5, vcc, v4, v3 2702; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 2703; GCN-NEXT: v_mul_hi_u32 v8, v6, v2 2704; GCN-NEXT: v_add_u32_e32 v8, vcc, v8, v3 2705; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 2706; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc 2707; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0 2708; GCN-NEXT: v_add_u32_e32 v2, vcc, v8, v2 2709; GCN-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc 2710; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc 2711; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v4 2712; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2713; GCN-NEXT: v_add_u32_e32 v6, vcc, v6, v2 2714; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc 2715; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 2716; GCN-NEXT: v_mul_lo_u32 v4, v7, s6 2717; GCN-NEXT: s_mov_b32 s6, 0x186a0 2718; GCN-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 2719; GCN-NEXT: v_add_u32_e32 v5, vcc, v3, v4 2720; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 2721; GCN-NEXT: v_mul_hi_u32 v8, v6, v2 2722; GCN-NEXT: v_add_u32_e32 v8, vcc, v8, v3 2723; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 2724; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc 2725; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0 2726; GCN-NEXT: v_add_u32_e32 v2, vcc, v8, v2 2727; GCN-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc 2728; GCN-NEXT: 
v_addc_u32_e32 v3, vcc, v5, v9, vcc 2729; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v4 2730; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2731; GCN-NEXT: v_add_u32_e32 v4, vcc, v6, v2 2732; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v3, vcc 2733; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v5, 0 2734; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 2735; GCN-NEXT: v_add_u32_e32 v6, vcc, v6, v2 2736; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc 2737; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v4, 0 2738; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0 2739; GCN-NEXT: v_add_u32_e32 v2, vcc, v6, v2 2740; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v3, vcc 2741; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc 2742; GCN-NEXT: v_add_u32_e32 v4, vcc, v2, v4 2743; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc 2744; GCN-NEXT: v_mul_lo_u32 v6, v5, s6 2745; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s6, 0 2746; GCN-NEXT: s_mov_b32 s4, 0x1869f 2747; GCN-NEXT: v_add_u32_e32 v3, vcc, v3, v6 2748; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 2749; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc 2750; GCN-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0 2751; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc 2752; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 2753; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc 2754; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2755; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc 2756; GCN-NEXT: v_add_u32_e32 v3, vcc, 2, v4 2757; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc 2758; GCN-NEXT: v_add_u32_e32 v7, vcc, 1, v4 2759; GCN-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0 2760; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc 2761; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] 2762; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 2763; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 2764; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[4:5] 2765; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc 2766; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 2767; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc 2768; GCN-NEXT: v_cndmask_b32_e64 v0, v4, v2, s[4:5] 2769; 
GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] 2770; GCN-NEXT: s_setpc_b64 s[30:31] 2771; 2772; GFX1030-LABEL: v_test_udiv64_mulhi_fold: 2773; GFX1030: ; %bb.0: 2774; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2775; GFX1030-NEXT: s_waitcnt_vscnt null, 0x0 2776; GFX1030-NEXT: s_mov_b32 s4, 0x346d900 2777; GFX1030-NEXT: s_mov_b32 s5, 0xfffe7960 2778; GFX1030-NEXT: s_add_u32 s4, 0x4237, s4 2779; GFX1030-NEXT: s_addc_u32 s6, 0, 0 2780; GFX1030-NEXT: v_add_co_u32 v2, s4, 0xa9000000, s4 2781; GFX1030-NEXT: s_cmpk_lg_u32 s4, 0x0 2782; GFX1030-NEXT: s_addc_u32 s4, s6, 0xa7c5 2783; GFX1030-NEXT: v_mul_hi_u32 v3, v2, s5 2784; GFX1030-NEXT: v_mul_lo_u32 v4, v2, s5 2785; GFX1030-NEXT: s_mul_i32 s5, s4, s5 2786; GFX1030-NEXT: v_sub_nc_u32_e32 v3, v3, v2 2787; GFX1030-NEXT: v_mul_hi_u32 v5, v2, v4 2788; GFX1030-NEXT: v_mul_hi_u32 v8, s4, v4 2789; GFX1030-NEXT: v_mul_lo_u32 v4, s4, v4 2790; GFX1030-NEXT: v_add_nc_u32_e32 v3, s5, v3 2791; GFX1030-NEXT: v_mul_lo_u32 v6, v2, v3 2792; GFX1030-NEXT: v_mul_hi_u32 v7, v2, v3 2793; GFX1030-NEXT: v_mul_hi_u32 v9, s4, v3 2794; GFX1030-NEXT: v_mul_lo_u32 v3, s4, v3 2795; GFX1030-NEXT: v_add_co_u32 v5, vcc_lo, v5, v6 2796; GFX1030-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v7, vcc_lo 2797; GFX1030-NEXT: v_add_co_u32 v4, vcc_lo, v5, v4 2798; GFX1030-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v6, v8, vcc_lo 2799; GFX1030-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo 2800; GFX1030-NEXT: v_add_co_u32 v3, vcc_lo, v4, v3 2801; GFX1030-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo 2802; GFX1030-NEXT: v_add_co_u32 v5, vcc_lo, v2, v3 2803; GFX1030-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s4, v4, vcc_lo 2804; GFX1030-NEXT: v_mul_hi_u32 v8, v0, v5 2805; GFX1030-NEXT: v_mad_u64_u32 v[4:5], s4, v1, v5, 0 2806; GFX1030-NEXT: v_mad_u64_u32 v[2:3], s4, v0, v6, 0 2807; GFX1030-NEXT: v_mad_u64_u32 v[6:7], s4, v1, v6, 0 2808; GFX1030-NEXT: s_mov_b32 s4, 0x186a0 2809; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 2810; GFX1030-NEXT: 
v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 2811; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 2812; GFX1030-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v5, vcc_lo 2813; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo 2814; GFX1030-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 2815; GFX1030-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo 2816; GFX1030-NEXT: v_mad_u64_u32 v[2:3], s5, v4, s4, 0 2817; GFX1030-NEXT: v_mul_lo_u32 v6, v5, s4 2818; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 2819; GFX1030-NEXT: v_add_nc_u32_e32 v3, v3, v6 2820; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo 2821; GFX1030-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4 2822; GFX1030-NEXT: s_mov_b32 s4, 0x1869f 2823; GFX1030-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo 2824; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, s4, v2 2825; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo 2826; GFX1030-NEXT: v_add_co_u32 v6, vcc_lo, v4, 2 2827; GFX1030-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo 2828; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, s4, v0 2829; GFX1030-NEXT: v_cmp_eq_u32_e64 s4, 0, v1 2830; GFX1030-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 2831; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 2832; GFX1030-NEXT: v_cndmask_b32_e64 v0, -1, v0, s4 2833; GFX1030-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo 2834; GFX1030-NEXT: v_add_co_u32 v3, vcc_lo, v4, 1 2835; GFX1030-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v5, vcc_lo 2836; GFX1030-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 2837; GFX1030-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 2838; GFX1030-NEXT: v_cndmask_b32_e32 v2, v8, v7, vcc_lo 2839; GFX1030-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 2840; GFX1030-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo 2841; GFX1030-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo 2842; GFX1030-NEXT: s_setpc_b64 s[30:31] 2843; 2844; EG-LABEL: v_test_udiv64_mulhi_fold: 2845; EG: ; %bb.0: 2846; EG-NEXT: CF_END 2847; EG-NEXT: PAD 2848 %d = udiv i64 %arg, 100000 2849 ret i64 %d 2850} 2851