; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s

; Codegen tests for frem lowering on AMDGPU kernels. Each RUN line above
; selects one subtarget and pairs it with its own check prefix. Do not edit
; the check lines by hand; regenerate them with update_llc_test_checks.py.
; Every kernel loads its first operand from %in1 and its second operand from
; element 4 of %in2, then stores the remainder to %out.

; frem on half without fast-math flags. The expected code for the two oldest
; subtargets converts both operands to f32 and uses the div_scale / div_fmas /
; div_fixup sequence; the newer subtargets use an f32 rcp-and-multiply
; estimate followed by v_div_fixup_f16, v_trunc_f16 and v_fma_f16.
define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
; SI-LABEL: frem_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s4
; SI-NEXT:    s_mov_b32 s9, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s2, s10
; SI-NEXT:    s_mov_b32 s3, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
; SI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
; SI-NEXT:    v_rcp_f32_e32 v4, v3
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
; SI-NEXT:    v_fma_f32 v4, v5, v4, v4
; SI-NEXT:    v_mul_f32_e32 v5, v2, v4
; SI-NEXT:    v_fma_f32 v6, -v3, v5, v2
; SI-NEXT:    v_fma_f32 v5, v6, v4, v5
; SI-NEXT:    v_fma_f32 v2, -v3, v5, v2
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
; SI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
; SI-NEXT:    v_trunc_f32_e32 v2, v2
; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: frem_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s11, 0xf000
; CI-NEXT:    s_mov_b32 s10, -1
; CI-NEXT:    s_mov_b32 s2, s10
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s8, s4
; CI-NEXT:    s_mov_b32 s9, s5
; CI-NEXT:    s_mov_b32 s4, s6
; CI-NEXT:    s_mov_b32 s5, s7
; CI-NEXT:    s_mov_b32 s6, s10
; CI-NEXT:    s_mov_b32 s7, s11
; CI-NEXT:    s_mov_b32 s3, s11
; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
; CI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
; CI-NEXT:    v_rcp_f32_e32 v4, v3
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
; CI-NEXT:    v_mul_f32_e32 v5, v2, v4
; CI-NEXT:    v_fma_f32 v6, -v3, v5, v2
; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
; CI-NEXT:    v_fma_f32 v2, -v3, v5, v2
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
; CI-NEXT:    v_trunc_f32_e32 v2, v2
; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    s_add_u32 s0, s0, 8
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_load_ushort v4, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_load_ushort v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_f16_e32 v3, v4
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v5, v2
; VI-NEXT:    v_rcp_f32_e32 v5, v5
; VI-NEXT:    v_mul_f32_e32 v3, v3, v5
; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; VI-NEXT:    v_div_fixup_f16 v3, v3, v2, v4
; VI-NEXT:    v_trunc_f16_e32 v3, v3
; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: frem_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v2
; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v4
; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: frem_f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
; GFX10-NEXT:    v_rcp_f32_e32 v4, v4
; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v4
; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: frem_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT:    global_load_u16 v2, v0, s[0:1] offset:8
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v4
; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
                      half addrspace(1)* %in2) #0 {
  %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
  %r0 = load half, half addrspace(1)* %in1, align 4
  %r1 = load half, half addrspace(1)* %gep2, align 4
  %r2 = frem half %r0, %r1
  store half %r2, half addrspace(1)* %out, align 4
  ret void
}

; frem on half with the `fast` flag. The expected code is the short
; rcp / mul / trunc / fma sequence with no div_fixup step: in f32 on the two
; oldest subtargets and directly in f16 on the newer ones.
define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
; SI-LABEL: fast_frem_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s4
; SI-NEXT:    s_mov_b32 s9, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s2, s10
; SI-NEXT:    s_mov_b32 s3, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_rcp_f32_e32 v2, v1
; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
; SI-NEXT:    v_trunc_f32_e32 v2, v2
; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: fast_frem_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s11, 0xf000
; CI-NEXT:    s_mov_b32 s10, -1
; CI-NEXT:    s_mov_b32 s2, s10
; CI-NEXT:    s_mov_b32 s3, s11
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
; CI-NEXT:    s_mov_b32 s8, s4
; CI-NEXT:    s_mov_b32 s9, s5
; CI-NEXT:    s_mov_b32 s4, s6
; CI-NEXT:    s_mov_b32 s5, s7
; CI-NEXT:    s_mov_b32 s6, s10
; CI-NEXT:    s_mov_b32 s7, s11
; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CI-NEXT:    v_rcp_f32_e32 v2, v1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
; CI-NEXT:    v_trunc_f32_e32 v2, v2
; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: fast_frem_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    s_add_u32 s0, s0, 8
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_load_ushort v4, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_load_ushort v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_rcp_f16_e32 v3, v2
; VI-NEXT:    v_mul_f16_e32 v3, v4, v3
; VI-NEXT:    v_trunc_f16_e32 v3, v3
; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fast_frem_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_rcp_f16_e32 v3, v2
; GFX9-NEXT:    v_mul_f16_e32 v3, v1, v3
; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: fast_frem_f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_rcp_f16_e32 v3, v2
; GFX10-NEXT:    v_mul_f16_e32 v3, v1, v3
; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fast_frem_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT:    global_load_u16 v2, v0, s[0:1] offset:8
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_rcp_f16_e32 v3, v2
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_mul_f16_e32 v3, v1, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
                      half addrspace(1)* %in2) #0 {
  %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
  %r0 = load half, half addrspace(1)* %in1, align 4
  %r1 = load half, half addrspace(1)* %gep2, align 4
  %r2 = frem fast half %r0, %r1
  store half %r2, half addrspace(1)* %out, align 4
  ret void
}

; frem on half with only the `afn` flag, using attribute group #1 instead of
; #0. The expected instruction sequences are the same as for fast_frem_f16.
define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
; SI-LABEL: unsafe_frem_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s4
; SI-NEXT:    s_mov_b32 s9, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s2, s10
; SI-NEXT:    s_mov_b32 s3, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_rcp_f32_e32 v2, v1
; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
; SI-NEXT:    v_trunc_f32_e32 v2, v2
; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: unsafe_frem_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s11, 0xf000
; CI-NEXT:    s_mov_b32 s10, -1
; CI-NEXT:    s_mov_b32 s2, s10
; CI-NEXT:    s_mov_b32 s3, s11
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
; CI-NEXT:    s_mov_b32 s8, s4
; CI-NEXT:    s_mov_b32 s9, s5
; CI-NEXT:    s_mov_b32 s4, s6
; CI-NEXT:    s_mov_b32 s5, s7
; CI-NEXT:    s_mov_b32 s6, s10
; CI-NEXT:    s_mov_b32 s7, s11
; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CI-NEXT:    v_rcp_f32_e32 v2, v1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
; CI-NEXT:    v_trunc_f32_e32 v2, v2
; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: unsafe_frem_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    s_add_u32 s0, s0, 8
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_load_ushort v4, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_load_ushort v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_rcp_f16_e32 v3, v2
; VI-NEXT:    v_mul_f16_e32 v3, v4, v3
; VI-NEXT:    v_trunc_f16_e32 v3, v3
; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: unsafe_frem_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_rcp_f16_e32 v3, v2
; GFX9-NEXT:    v_mul_f16_e32 v3, v1, v3
; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: unsafe_frem_f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_rcp_f16_e32 v3, v2
; GFX10-NEXT:    v_mul_f16_e32 v3, v1, v3
; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: unsafe_frem_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT:    global_load_u16 v2, v0, s[0:1] offset:8
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_rcp_f16_e32 v3, v2
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_mul_f16_e32 v3, v1, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
                      half addrspace(1)* %in2) #1 {
  %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
  %r0 = load half, half addrspace(1)* %in1, align 4
  %r1 = load half, half addrspace(1)* %gep2, align 4
  %r2 = frem afn half %r0, %r1
  store half %r2, half addrspace(1)* %out, align 4
  ret void
}

; frem on float without fast-math flags. Every subtarget is expected to use
; the div_scale / rcp / fma refinement / div_fmas / div_fixup sequence, with a
; pair of mode writes around the fma refinement: s_setreg_imm32_b32 on the
; older subtargets and s_denorm_mode on GFX10 and GFX11.
define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
; SI-LABEL: frem_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s4
; SI-NEXT:    s_mov_b32 s9, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s2, s10
; SI-NEXT:    s_mov_b32 s3, s11
; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
; SI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
; SI-NEXT:    v_rcp_f32_e32 v4, v3
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
; SI-NEXT:    v_fma_f32 v4, v5, v4, v4
; SI-NEXT:    v_mul_f32_e32 v5, v2, v4
; SI-NEXT:    v_fma_f32 v6, -v3, v5, v2
; SI-NEXT:    v_fma_f32 v5, v6, v4, v5
; SI-NEXT:    v_fma_f32 v2, -v3, v5, v2
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
; SI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
; SI-NEXT:    v_trunc_f32_e32 v2, v2
; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: frem_f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s11, 0xf000
; CI-NEXT:    s_mov_b32 s10, -1
; CI-NEXT:    s_mov_b32 s2, s10
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s8, s4
; CI-NEXT:    s_mov_b32 s9, s5
; CI-NEXT:    s_mov_b32 s4, s6
; CI-NEXT:    s_mov_b32 s5, s7
; CI-NEXT:    s_mov_b32 s6, s10
; CI-NEXT:    s_mov_b32 s7, s11
; CI-NEXT:    s_mov_b32 s3, s11
; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
; CI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
; CI-NEXT:    v_rcp_f32_e32 v4, v3
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
; CI-NEXT:    v_mul_f32_e32 v5, v2, v4
; CI-NEXT:    v_fma_f32 v6, -v3, v5, v2
; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
; CI-NEXT:    v_fma_f32 v2, -v3, v5, v2
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
; CI-NEXT:    v_trunc_f32_e32 v2, v2
; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    s_add_u32 s0, s0, 16
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_load_dword v4, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_div_scale_f32 v5, s[0:1], v2, v2, v4
; VI-NEXT:    v_div_scale_f32 v3, vcc, v4, v2, v4
; VI-NEXT:    v_rcp_f32_e32 v6, v5
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
; VI-NEXT:    v_fma_f32 v6, v7, v6, v6
; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
; VI-NEXT:    v_fma_f32 v8, -v5, v7, v3
; VI-NEXT:    v_fma_f32 v7, v8, v6, v7
; VI-NEXT:    v_fma_f32 v3, -v5, v7, v3
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
; VI-NEXT:    v_div_fixup_f32 v3, v3, v2, v4
; VI-NEXT:    v_trunc_f32_e32 v3, v3
; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: frem_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_div_scale_f32 v4, s[0:1], v2, v2, v1
; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v1, v2, v1
; GFX9-NEXT:    v_rcp_f32_e32 v5, v4
; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX9-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
; GFX9-NEXT:    v_fma_f32 v5, v6, v5, v5
; GFX9-NEXT:    v_mul_f32_e32 v6, v3, v5
; GFX9-NEXT:    v_fma_f32 v7, -v4, v6, v3
; GFX9-NEXT:    v_fma_f32 v6, v7, v5, v6
; GFX9-NEXT:    v_fma_f32 v3, -v4, v6, v3
; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX9-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
; GFX9-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: frem_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v4, s0, v2, v2, v1
; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
; GFX10-NEXT:    v_rcp_f32_e32 v5, v4
; GFX10-NEXT:    s_denorm_mode 15
; GFX10-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
; GFX10-NEXT:    v_fmac_f32_e32 v5, v6, v5
; GFX10-NEXT:    v_mul_f32_e32 v6, v3, v5
; GFX10-NEXT:    v_fma_f32 v7, -v4, v6, v3
; GFX10-NEXT:    v_fmac_f32_e32 v6, v7, v5
; GFX10-NEXT:    v_fma_f32 v3, -v4, v6, v3
; GFX10-NEXT:    s_denorm_mode 12
; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
; GFX10-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: frem_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_div_scale_f32 v4, null, v2, v2, v1
; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_rcp_f32_e32 v5, v4
; GFX11-NEXT:    s_denorm_mode 15
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
; GFX11-NEXT:    v_fmac_f32_e32 v5, v6, v5
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mul_f32_e32 v6, v3, v5
; GFX11-NEXT:    v_fma_f32 v7, -v4, v6, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_fmac_f32_e32 v6, v7, v5
; GFX11-NEXT:    v_fma_f32 v3, -v4, v6, v3
; GFX11-NEXT:    s_denorm_mode 12
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
; GFX11-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
                      float addrspace(1)* %in2) #0 {
  %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
  %r0 = load float, float addrspace(1)* %in1, align 4
  %r1 = load float, float addrspace(1)* %gep2, align 4
  %r2 = frem float %r0, %r1
  store float %r2, float addrspace(1)* %out, align 4
  ret void
}

; frem on float with the `fast` flag. Every subtarget is expected to emit the
; short v_rcp_f32 / v_mul_f32 / v_trunc_f32 / v_fma_f32 sequence with no
; div_scale or div_fixup instructions.
define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
; SI-LABEL: fast_frem_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s4
; SI-NEXT:    s_mov_b32 s9, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s2, s10
; SI-NEXT:    s_mov_b32 s3, s11
; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_rcp_f32_e32 v2, v1
; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
; SI-NEXT:    v_trunc_f32_e32 v2, v2
; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: fast_frem_f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s11, 0xf000
; CI-NEXT:    s_mov_b32 s10, -1
; CI-NEXT:    s_mov_b32 s2, s10
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s8, s4
; CI-NEXT:    s_mov_b32 s9, s5
; CI-NEXT:    s_mov_b32 s4, s6
; CI-NEXT:    s_mov_b32 s5, s7
; CI-NEXT:    s_mov_b32 s6, s10
; CI-NEXT:    s_mov_b32 s7, s11
; CI-NEXT:    s_mov_b32 s3, s11
; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_rcp_f32_e32 v2, v1
; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
; CI-NEXT:    v_trunc_f32_e32 v2, v2
; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: fast_frem_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    s_add_u32 s0, s0, 16
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_load_dword v4, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_rcp_f32_e32 v3, v2
; VI-NEXT:    v_mul_f32_e32 v3, v4, v3
; VI-NEXT:    v_trunc_f32_e32 v3, v3
; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fast_frem_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_rcp_f32_e32 v3, v2
; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: fast_frem_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
; GFX10-NEXT:    v_mul_f32_e32 v3, v1, v3
; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fast_frem_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_mul_f32_e32 v3, v1, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
                      float addrspace(1)* %in2) #0 {
  %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
  %r0 = load float, float addrspace(1)* %in1, align 4
  %r1 = load float, float addrspace(1)* %gep2, align 4
  %r2 = frem fast float %r0, %r1
  store float %r2, float addrspace(1)* %out, align 4
  ret void
}

; frem on float with only the `afn` flag, using attribute group #1 instead of
; #0. The expected instruction sequences are the same as for fast_frem_f32.
define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
; SI-LABEL: unsafe_frem_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s4
; SI-NEXT:    s_mov_b32 s9, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s2, s10
; SI-NEXT:    s_mov_b32 s3, s11
; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_rcp_f32_e32 v2, v1
; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
; SI-NEXT:    v_trunc_f32_e32 v2, v2
; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: unsafe_frem_f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s11, 0xf000
; CI-NEXT:    s_mov_b32 s10, -1
; CI-NEXT:    s_mov_b32 s2, s10
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s8, s4
; CI-NEXT:    s_mov_b32 s9, s5
; CI-NEXT:    s_mov_b32 s4, s6
; CI-NEXT:    s_mov_b32 s5, s7
; CI-NEXT:    s_mov_b32 s6, s10
; CI-NEXT:    s_mov_b32 s7, s11
; CI-NEXT:    s_mov_b32 s3, s11
; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_rcp_f32_e32 v2, v1
; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
; CI-NEXT:    v_trunc_f32_e32 v2, v2
; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: unsafe_frem_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    s_add_u32 s0, s0, 16
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_load_dword v4, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_rcp_f32_e32 v3, v2
; VI-NEXT:    v_mul_f32_e32 v3, v4, v3
; VI-NEXT:    v_trunc_f32_e32 v3, v3
; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: unsafe_frem_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_rcp_f32_e32 v3, v2
; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: unsafe_frem_f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
; GFX10-NEXT:    v_mul_f32_e32 v3, v1, v3
; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: unsafe_frem_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_mul_f32_e32 v3, v1, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
                      float addrspace(1)* %in2) #1 {
  %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
  %r0 = load float, float addrspace(1)* %in1, align 4
  %r1 = load float, float addrspace(1)* %gep2, align 4
  %r2 = frem afn float %r0, %r1
store float %r2, float addrspace(1)* %out, align 4 968 ret void 969} 970 971define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, 972; SI-LABEL: frem_f64: 973; SI: ; %bb.0: 974; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 975; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 976; SI-NEXT: s_mov_b32 s7, 0xf000 977; SI-NEXT: s_mov_b32 s6, -1 978; SI-NEXT: s_waitcnt lgkmcnt(0) 979; SI-NEXT: s_mov_b32 s4, s8 980; SI-NEXT: s_mov_b32 s5, s9 981; SI-NEXT: s_mov_b32 s8, s10 982; SI-NEXT: s_mov_b32 s9, s11 983; SI-NEXT: s_mov_b32 s10, s6 984; SI-NEXT: s_mov_b32 s11, s7 985; SI-NEXT: s_mov_b32 s2, s6 986; SI-NEXT: s_mov_b32 s3, s7 987; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 988; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 989; SI-NEXT: s_waitcnt vmcnt(0) 990; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] 991; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 992; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 993; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 994; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 995; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 996; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1] 997; SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 998; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9] 999; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1000; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v9 1001; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc 1002; SI-NEXT: s_nop 1 1003; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11] 1004; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 1005; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 1006; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 1007; SI-NEXT: s_mov_b32 s1, 0xfffff 1008; SI-NEXT: s_mov_b32 s0, s6 1009; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 1010; SI-NEXT: v_not_b32_e32 v6, v6 1011; SI-NEXT: v_and_b32_e32 v6, v4, v6 1012; SI-NEXT: v_not_b32_e32 v7, v7 1013; SI-NEXT: v_and_b32_e32 v7, v5, v7 1014; SI-NEXT: v_and_b32_e32 v9, 
0x80000000, v5 1015; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8 1016; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc 1017; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8 1018; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] 1019; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 1020; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] 1021; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1022; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1023; SI-NEXT: s_endpgm 1024; 1025; CI-LABEL: frem_f64: 1026; CI: ; %bb.0: 1027; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1028; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1029; CI-NEXT: s_mov_b32 s11, 0xf000 1030; CI-NEXT: s_mov_b32 s10, -1 1031; CI-NEXT: s_mov_b32 s2, s10 1032; CI-NEXT: s_waitcnt lgkmcnt(0) 1033; CI-NEXT: s_mov_b32 s8, s4 1034; CI-NEXT: s_mov_b32 s9, s5 1035; CI-NEXT: s_mov_b32 s4, s6 1036; CI-NEXT: s_mov_b32 s5, s7 1037; CI-NEXT: s_mov_b32 s6, s10 1038; CI-NEXT: s_mov_b32 s7, s11 1039; CI-NEXT: s_mov_b32 s3, s11 1040; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1041; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 1042; CI-NEXT: s_waitcnt vmcnt(0) 1043; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] 1044; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1045; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1046; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1047; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1048; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1049; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] 1050; CI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 1051; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 1052; CI-NEXT: s_nop 1 1053; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 1054; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 1055; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1056; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1057; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1058; CI-NEXT: s_endpgm 1059; 1060; VI-LABEL: frem_f64: 1061; VI: 
; %bb.0: 1062; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1063; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1064; VI-NEXT: s_waitcnt lgkmcnt(0) 1065; VI-NEXT: v_mov_b32_e32 v2, s6 1066; VI-NEXT: v_mov_b32_e32 v3, s7 1067; VI-NEXT: v_mov_b32_e32 v4, s0 1068; VI-NEXT: v_mov_b32_e32 v5, s1 1069; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1070; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 1071; VI-NEXT: v_mov_b32_e32 v0, s4 1072; VI-NEXT: v_mov_b32_e32 v1, s5 1073; VI-NEXT: s_waitcnt vmcnt(0) 1074; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3] 1075; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] 1076; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 1077; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 1078; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 1079; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 1080; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3] 1081; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] 1082; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] 1083; VI-NEXT: s_nop 1 1084; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] 1085; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3] 1086; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 1087; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 1088; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1089; VI-NEXT: s_endpgm 1090; 1091; GFX9-LABEL: frem_f64: 1092; GFX9: ; %bb.0: 1093; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1094; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1095; GFX9-NEXT: v_mov_b32_e32 v12, 0 1096; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1097; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] 1098; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] 1099; GFX9-NEXT: s_waitcnt vmcnt(0) 1100; GFX9-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] 1101; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1102; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1103; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1104; GFX9-NEXT: v_fma_f64 v[8:9], 
-v[4:5], v[6:7], 1.0 1105; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1106; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] 1107; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 1108; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 1109; GFX9-NEXT: s_nop 1 1110; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 1111; GFX9-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 1112; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1113; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1114; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5] 1115; GFX9-NEXT: s_endpgm 1116; 1117; GFX10-LABEL: frem_f64: 1118; GFX10: ; %bb.0: 1119; GFX10-NEXT: s_clause 0x1 1120; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1121; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1122; GFX10-NEXT: v_mov_b32_e32 v12, 0 1123; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1124; GFX10-NEXT: s_clause 0x1 1125; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] 1126; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] 1127; GFX10-NEXT: s_waitcnt vmcnt(0) 1128; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1] 1129; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1130; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1131; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1132; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1133; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1134; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] 1135; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 1136; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 1137; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 1138; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 1139; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1140; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1141; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5] 1142; GFX10-NEXT: s_endpgm 1143; 1144; GFX11-LABEL: frem_f64: 1145; GFX11: ; %bb.0: 1146; 
GFX11-NEXT: s_clause 0x1 1147; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 1148; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 1149; GFX11-NEXT: v_mov_b32_e32 v12, 0 1150; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1151; GFX11-NEXT: s_clause 0x1 1152; GFX11-NEXT: global_load_b64 v[0:1], v12, s[6:7] 1153; GFX11-NEXT: global_load_b64 v[2:3], v12, s[0:1] 1154; GFX11-NEXT: s_waitcnt vmcnt(0) 1155; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] 1156; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 1157; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1158; GFX11-NEXT: s_waitcnt_depctr 0xfff 1159; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1160; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1161; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1162; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1163; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1164; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] 1165; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1166; GFX11-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 1167; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 1168; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1169; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 1170; GFX11-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 1171; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1172; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1173; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1174; GFX11-NEXT: global_store_b64 v12, v[0:1], s[4:5] 1175; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1176; GFX11-NEXT: s_endpgm 1177 double addrspace(1)* %in2) #0 { 1178 %r0 = load double, double addrspace(1)* %in1, align 8 1179 %r1 = load double, double addrspace(1)* %in2, align 8 1180 %r2 = frem double %r0, %r1 1181 store double %r2, double 
addrspace(1)* %out, align 8 1182 ret void 1183} 1184 1185define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, 1186; SI-LABEL: fast_frem_f64: 1187; SI: ; %bb.0: 1188; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 1189; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1190; SI-NEXT: s_mov_b32 s7, 0xf000 1191; SI-NEXT: s_mov_b32 s6, -1 1192; SI-NEXT: s_waitcnt lgkmcnt(0) 1193; SI-NEXT: s_mov_b32 s4, s8 1194; SI-NEXT: s_mov_b32 s5, s9 1195; SI-NEXT: s_mov_b32 s8, s10 1196; SI-NEXT: s_mov_b32 s9, s11 1197; SI-NEXT: s_mov_b32 s10, s6 1198; SI-NEXT: s_mov_b32 s11, s7 1199; SI-NEXT: s_mov_b32 s2, s6 1200; SI-NEXT: s_mov_b32 s3, s7 1201; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1202; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 1203; SI-NEXT: s_waitcnt vmcnt(0) 1204; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1205; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1206; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1207; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1208; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1209; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1210; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1211; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1212; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 1213; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 1214; SI-NEXT: s_mov_b32 s1, 0xfffff 1215; SI-NEXT: s_mov_b32 s0, s6 1216; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 1217; SI-NEXT: v_not_b32_e32 v6, v6 1218; SI-NEXT: v_and_b32_e32 v6, v4, v6 1219; SI-NEXT: v_not_b32_e32 v7, v7 1220; SI-NEXT: v_and_b32_e32 v7, v5, v7 1221; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5 1222; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8 1223; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc 1224; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8 1225; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] 1226; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 1227; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] 1228; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 
1229; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1230; SI-NEXT: s_endpgm 1231; 1232; CI-LABEL: fast_frem_f64: 1233; CI: ; %bb.0: 1234; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1235; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1236; CI-NEXT: s_mov_b32 s11, 0xf000 1237; CI-NEXT: s_mov_b32 s10, -1 1238; CI-NEXT: s_mov_b32 s2, s10 1239; CI-NEXT: s_waitcnt lgkmcnt(0) 1240; CI-NEXT: s_mov_b32 s8, s4 1241; CI-NEXT: s_mov_b32 s9, s5 1242; CI-NEXT: s_mov_b32 s4, s6 1243; CI-NEXT: s_mov_b32 s5, s7 1244; CI-NEXT: s_mov_b32 s6, s10 1245; CI-NEXT: s_mov_b32 s7, s11 1246; CI-NEXT: s_mov_b32 s3, s11 1247; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1248; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 1249; CI-NEXT: s_waitcnt vmcnt(0) 1250; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1251; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1252; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1253; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1254; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1255; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1256; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1257; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1258; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1259; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1260; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1261; CI-NEXT: s_endpgm 1262; 1263; VI-LABEL: fast_frem_f64: 1264; VI: ; %bb.0: 1265; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1266; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1267; VI-NEXT: s_waitcnt lgkmcnt(0) 1268; VI-NEXT: v_mov_b32_e32 v2, s6 1269; VI-NEXT: v_mov_b32_e32 v3, s7 1270; VI-NEXT: v_mov_b32_e32 v4, s0 1271; VI-NEXT: v_mov_b32_e32 v5, s1 1272; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1273; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 1274; VI-NEXT: v_mov_b32_e32 v0, s4 1275; VI-NEXT: v_mov_b32_e32 v1, s5 1276; VI-NEXT: s_waitcnt vmcnt(0) 1277; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1278; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], 
v[6:7], 1.0 1279; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1280; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1281; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1282; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] 1283; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] 1284; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] 1285; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 1286; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 1287; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1288; VI-NEXT: s_endpgm 1289; 1290; GFX9-LABEL: fast_frem_f64: 1291; GFX9: ; %bb.0: 1292; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1293; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1294; GFX9-NEXT: v_mov_b32_e32 v10, 0 1295; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1296; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] 1297; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] 1298; GFX9-NEXT: s_waitcnt vmcnt(0) 1299; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1300; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1301; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1302; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1303; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1304; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1305; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1306; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1307; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1308; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1309; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] 1310; GFX9-NEXT: s_endpgm 1311; 1312; GFX10-LABEL: fast_frem_f64: 1313; GFX10: ; %bb.0: 1314; GFX10-NEXT: s_clause 0x1 1315; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1316; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1317; GFX10-NEXT: v_mov_b32_e32 v10, 0 1318; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1319; GFX10-NEXT: s_clause 0x1 1320; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] 1321; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] 1322; GFX10-NEXT: s_waitcnt 
vmcnt(0) 1323; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1324; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1325; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1326; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1327; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1328; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1329; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1330; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1331; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1332; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1333; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] 1334; GFX10-NEXT: s_endpgm 1335; 1336; GFX11-LABEL: fast_frem_f64: 1337; GFX11: ; %bb.0: 1338; GFX11-NEXT: s_clause 0x1 1339; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 1340; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 1341; GFX11-NEXT: v_mov_b32_e32 v10, 0 1342; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1343; GFX11-NEXT: s_clause 0x1 1344; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7] 1345; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1] 1346; GFX11-NEXT: s_waitcnt vmcnt(0) 1347; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1348; GFX11-NEXT: s_waitcnt_depctr 0xfff 1349; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1350; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1351; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1352; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1353; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1354; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1355; GFX11-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1356; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1357; GFX11-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1358; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1359; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1360; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 
1361; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1362; GFX11-NEXT: global_store_b64 v10, v[0:1], s[4:5] 1363; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1364; GFX11-NEXT: s_endpgm 1365 double addrspace(1)* %in2) #0 { 1366 %r0 = load double, double addrspace(1)* %in1, align 8 1367 %r1 = load double, double addrspace(1)* %in2, align 8 1368 %r2 = frem fast double %r0, %r1 1369 store double %r2, double addrspace(1)* %out, align 8 1370 ret void 1371} 1372 1373define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, 1374; SI-LABEL: unsafe_frem_f64: 1375; SI: ; %bb.0: 1376; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 1377; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1378; SI-NEXT: s_mov_b32 s7, 0xf000 1379; SI-NEXT: s_mov_b32 s6, -1 1380; SI-NEXT: s_waitcnt lgkmcnt(0) 1381; SI-NEXT: s_mov_b32 s4, s8 1382; SI-NEXT: s_mov_b32 s5, s9 1383; SI-NEXT: s_mov_b32 s8, s10 1384; SI-NEXT: s_mov_b32 s9, s11 1385; SI-NEXT: s_mov_b32 s10, s6 1386; SI-NEXT: s_mov_b32 s11, s7 1387; SI-NEXT: s_mov_b32 s2, s6 1388; SI-NEXT: s_mov_b32 s3, s7 1389; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1390; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 1391; SI-NEXT: s_waitcnt vmcnt(0) 1392; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1393; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1394; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1395; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1396; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1397; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1398; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1399; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1400; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 1401; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 1402; SI-NEXT: s_mov_b32 s1, 0xfffff 1403; SI-NEXT: s_mov_b32 s0, s6 1404; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 1405; SI-NEXT: v_not_b32_e32 v6, v6 1406; SI-NEXT: v_and_b32_e32 v6, v4, v6 1407; SI-NEXT: v_not_b32_e32 v7, v7 1408; SI-NEXT: 
v_and_b32_e32 v7, v5, v7 1409; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5 1410; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8 1411; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc 1412; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8 1413; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] 1414; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 1415; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] 1416; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1417; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1418; SI-NEXT: s_endpgm 1419; 1420; CI-LABEL: unsafe_frem_f64: 1421; CI: ; %bb.0: 1422; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1423; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1424; CI-NEXT: s_mov_b32 s11, 0xf000 1425; CI-NEXT: s_mov_b32 s10, -1 1426; CI-NEXT: s_mov_b32 s2, s10 1427; CI-NEXT: s_waitcnt lgkmcnt(0) 1428; CI-NEXT: s_mov_b32 s8, s4 1429; CI-NEXT: s_mov_b32 s9, s5 1430; CI-NEXT: s_mov_b32 s4, s6 1431; CI-NEXT: s_mov_b32 s5, s7 1432; CI-NEXT: s_mov_b32 s6, s10 1433; CI-NEXT: s_mov_b32 s7, s11 1434; CI-NEXT: s_mov_b32 s3, s11 1435; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1436; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 1437; CI-NEXT: s_waitcnt vmcnt(0) 1438; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1439; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1440; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1441; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1442; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1443; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1444; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1445; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1446; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1447; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1448; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1449; CI-NEXT: s_endpgm 1450; 1451; VI-LABEL: unsafe_frem_f64: 1452; VI: ; %bb.0: 1453; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1454; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1455; VI-NEXT: s_waitcnt lgkmcnt(0) 1456; 
VI-NEXT: v_mov_b32_e32 v2, s6 1457; VI-NEXT: v_mov_b32_e32 v3, s7 1458; VI-NEXT: v_mov_b32_e32 v4, s0 1459; VI-NEXT: v_mov_b32_e32 v5, s1 1460; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1461; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 1462; VI-NEXT: v_mov_b32_e32 v0, s4 1463; VI-NEXT: v_mov_b32_e32 v1, s5 1464; VI-NEXT: s_waitcnt vmcnt(0) 1465; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1466; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1467; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1468; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1469; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1470; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] 1471; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] 1472; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] 1473; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 1474; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 1475; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1476; VI-NEXT: s_endpgm 1477; 1478; GFX9-LABEL: unsafe_frem_f64: 1479; GFX9: ; %bb.0: 1480; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1481; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1482; GFX9-NEXT: v_mov_b32_e32 v10, 0 1483; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1484; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] 1485; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] 1486; GFX9-NEXT: s_waitcnt vmcnt(0) 1487; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1488; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1489; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1490; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1491; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1492; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1493; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1494; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1495; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1496; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1497; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] 1498; GFX9-NEXT: s_endpgm 1499; 1500; GFX10-LABEL: 
unsafe_frem_f64: 1501; GFX10: ; %bb.0: 1502; GFX10-NEXT: s_clause 0x1 1503; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1504; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1505; GFX10-NEXT: v_mov_b32_e32 v10, 0 1506; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1507; GFX10-NEXT: s_clause 0x1 1508; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] 1509; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] 1510; GFX10-NEXT: s_waitcnt vmcnt(0) 1511; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1512; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1513; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1514; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1515; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1516; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1517; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1518; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1519; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1520; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1521; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] 1522; GFX10-NEXT: s_endpgm 1523; 1524; GFX11-LABEL: unsafe_frem_f64: 1525; GFX11: ; %bb.0: 1526; GFX11-NEXT: s_clause 0x1 1527; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 1528; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 1529; GFX11-NEXT: v_mov_b32_e32 v10, 0 1530; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1531; GFX11-NEXT: s_clause 0x1 1532; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7] 1533; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1] 1534; GFX11-NEXT: s_waitcnt vmcnt(0) 1535; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1536; GFX11-NEXT: s_waitcnt_depctr 0xfff 1537; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1538; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1539; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1540; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1541; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1542; GFX11-NEXT: 
v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1543; GFX11-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1544; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1545; GFX11-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1546; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1547; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1548; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1549; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1550; GFX11-NEXT: global_store_b64 v10, v[0:1], s[4:5] 1551; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1552; GFX11-NEXT: s_endpgm 1553 double addrspace(1)* %in2) #1 { 1554 %r0 = load double, double addrspace(1)* %in1, align 8 1555 %r1 = load double, double addrspace(1)* %in2, align 8 1556 %r2 = frem afn double %r0, %r1 1557 store double %r2, double addrspace(1)* %out, align 8 1558 ret void 1559} 1560 1561define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, 1562; SI-LABEL: frem_v2f16: 1563; SI: ; %bb.0: 1564; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1565; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1566; SI-NEXT: s_mov_b32 s3, 0xf000 1567; SI-NEXT: s_mov_b32 s2, -1 1568; SI-NEXT: s_waitcnt lgkmcnt(0) 1569; SI-NEXT: s_mov_b32 s0, s4 1570; SI-NEXT: s_mov_b32 s1, s5 1571; SI-NEXT: s_mov_b32 s4, s6 1572; SI-NEXT: s_mov_b32 s5, s7 1573; SI-NEXT: s_mov_b32 s6, s2 1574; SI-NEXT: s_mov_b32 s7, s3 1575; SI-NEXT: s_mov_b32 s10, s2 1576; SI-NEXT: s_mov_b32 s11, s3 1577; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 1578; SI-NEXT: s_waitcnt vmcnt(0) 1579; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 1580; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1581; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1582; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 1583; SI-NEXT: s_waitcnt vmcnt(0) 1584; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 1585; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1586; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1587; SI-NEXT: v_div_scale_f32 v4, vcc, v0, 
v2, v0 1588; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 1589; SI-NEXT: v_rcp_f32_e32 v6, v5 1590; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1591; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 1592; SI-NEXT: v_fma_f32 v6, v7, v6, v6 1593; SI-NEXT: v_mul_f32_e32 v7, v4, v6 1594; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 1595; SI-NEXT: v_fma_f32 v7, v8, v6, v7 1596; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 1597; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1598; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 1599; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 1600; SI-NEXT: v_trunc_f32_e32 v4, v4 1601; SI-NEXT: v_fma_f32 v0, -v4, v2, v0 1602; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1603; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1604; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1605; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 1606; SI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 1607; SI-NEXT: v_rcp_f32_e32 v5, v4 1608; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1609; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 1610; SI-NEXT: v_fma_f32 v5, v6, v5, v5 1611; SI-NEXT: v_mul_f32_e32 v6, v2, v5 1612; SI-NEXT: v_fma_f32 v7, -v4, v6, v2 1613; SI-NEXT: v_fma_f32 v6, v7, v5, v6 1614; SI-NEXT: v_fma_f32 v2, -v4, v6, v2 1615; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1616; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 1617; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 1618; SI-NEXT: v_trunc_f32_e32 v2, v2 1619; SI-NEXT: v_fma_f32 v1, -v2, v3, v1 1620; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1621; SI-NEXT: v_or_b32_e32 v0, v1, v0 1622; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1623; SI-NEXT: s_endpgm 1624; 1625; CI-LABEL: frem_v2f16: 1626; CI: ; %bb.0: 1627; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1628; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1629; CI-NEXT: s_mov_b32 s3, 0xf000 1630; CI-NEXT: s_mov_b32 s2, -1 1631; CI-NEXT: s_mov_b32 s10, s2 1632; CI-NEXT: s_waitcnt lgkmcnt(0) 1633; CI-NEXT: s_mov_b32 s0, s4 1634; CI-NEXT: s_mov_b32 s1, s5 1635; CI-NEXT: s_mov_b32 s4, 
s6 1636; CI-NEXT: s_mov_b32 s5, s7 1637; CI-NEXT: s_mov_b32 s6, s2 1638; CI-NEXT: s_mov_b32 s7, s3 1639; CI-NEXT: s_mov_b32 s11, s3 1640; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 1641; CI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 1642; CI-NEXT: s_waitcnt vmcnt(1) 1643; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 1644; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1645; CI-NEXT: s_waitcnt vmcnt(0) 1646; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 1647; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1648; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1649; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 1650; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 1651; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 1652; CI-NEXT: v_rcp_f32_e32 v6, v5 1653; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1654; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 1655; CI-NEXT: v_fma_f32 v6, v7, v6, v6 1656; CI-NEXT: v_mul_f32_e32 v7, v4, v6 1657; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 1658; CI-NEXT: v_fma_f32 v7, v8, v6, v7 1659; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 1660; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1661; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 1662; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 1663; CI-NEXT: v_trunc_f32_e32 v4, v4 1664; CI-NEXT: v_fma_f32 v0, -v4, v2, v0 1665; CI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 1666; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 1667; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1668; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 1669; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1670; CI-NEXT: v_rcp_f32_e32 v5, v4 1671; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1672; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 1673; CI-NEXT: v_fma_f32 v5, v6, v5, v5 1674; CI-NEXT: v_mul_f32_e32 v6, v2, v5 1675; CI-NEXT: v_fma_f32 v7, -v4, v6, v2 1676; CI-NEXT: v_fma_f32 v6, v7, v5, v6 1677; CI-NEXT: v_fma_f32 v2, -v4, v6, v2 1678; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1679; CI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 1680; CI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 1681; 
CI-NEXT: v_trunc_f32_e32 v2, v2 1682; CI-NEXT: v_fma_f32 v1, -v2, v3, v1 1683; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 1684; CI-NEXT: v_or_b32_e32 v0, v1, v0 1685; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1686; CI-NEXT: s_endpgm 1687; 1688; VI-LABEL: frem_v2f16: 1689; VI: ; %bb.0: 1690; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1691; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1692; VI-NEXT: s_waitcnt lgkmcnt(0) 1693; VI-NEXT: v_mov_b32_e32 v2, s6 1694; VI-NEXT: s_add_u32 s0, s0, 16 1695; VI-NEXT: v_mov_b32_e32 v3, s7 1696; VI-NEXT: s_addc_u32 s1, s1, 0 1697; VI-NEXT: flat_load_dword v4, v[2:3] 1698; VI-NEXT: v_mov_b32_e32 v3, s1 1699; VI-NEXT: v_mov_b32_e32 v2, s0 1700; VI-NEXT: flat_load_dword v2, v[2:3] 1701; VI-NEXT: v_mov_b32_e32 v0, s4 1702; VI-NEXT: v_mov_b32_e32 v1, s5 1703; VI-NEXT: s_waitcnt vmcnt(1) 1704; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 1705; VI-NEXT: v_cvt_f32_f16_e32 v5, v3 1706; VI-NEXT: s_waitcnt vmcnt(0) 1707; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 1708; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 1709; VI-NEXT: v_rcp_f32_e32 v7, v7 1710; VI-NEXT: v_mul_f32_e32 v5, v5, v7 1711; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 1712; VI-NEXT: v_div_fixup_f16 v5, v5, v6, v3 1713; VI-NEXT: v_trunc_f16_e32 v5, v5 1714; VI-NEXT: v_fma_f16 v3, -v5, v6, v3 1715; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 1716; VI-NEXT: v_cvt_f32_f16_e32 v5, v4 1717; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1718; VI-NEXT: v_rcp_f32_e32 v6, v6 1719; VI-NEXT: v_mul_f32_e32 v5, v5, v6 1720; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 1721; VI-NEXT: v_div_fixup_f16 v5, v5, v2, v4 1722; VI-NEXT: v_trunc_f16_e32 v5, v5 1723; VI-NEXT: v_fma_f16 v2, -v5, v2, v4 1724; VI-NEXT: v_or_b32_e32 v2, v2, v3 1725; VI-NEXT: flat_store_dword v[0:1], v2 1726; VI-NEXT: s_endpgm 1727; 1728; GFX9-LABEL: frem_v2f16: 1729; GFX9: ; %bb.0: 1730; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1731; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1732; GFX9-NEXT: v_mov_b32_e32 v0, 0 1733; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1734; GFX9-NEXT: 
global_load_dword v1, v0, s[6:7] 1735; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 1736; GFX9-NEXT: s_waitcnt vmcnt(1) 1737; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 1738; GFX9-NEXT: s_waitcnt vmcnt(0) 1739; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 1740; GFX9-NEXT: v_rcp_f32_e32 v4, v4 1741; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 1742; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 1743; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 1744; GFX9-NEXT: v_trunc_f16_e32 v3, v3 1745; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v1 1746; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1747; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 1748; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1749; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1 1750; GFX9-NEXT: v_rcp_f32_e32 v5, v5 1751; GFX9-NEXT: v_mul_f32_e32 v4, v4, v5 1752; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4 1753; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1 1754; GFX9-NEXT: v_trunc_f16_e32 v4, v4 1755; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1 1756; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 1757; GFX9-NEXT: global_store_dword v0, v1, s[4:5] 1758; GFX9-NEXT: s_endpgm 1759; 1760; GFX10-LABEL: frem_v2f16: 1761; GFX10: ; %bb.0: 1762; GFX10-NEXT: s_clause 0x1 1763; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1764; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1765; GFX10-NEXT: v_mov_b32_e32 v0, 0 1766; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1767; GFX10-NEXT: s_clause 0x1 1768; GFX10-NEXT: global_load_dword v1, v0, s[6:7] 1769; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 1770; GFX10-NEXT: s_waitcnt vmcnt(1) 1771; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 1772; GFX10-NEXT: s_waitcnt vmcnt(0) 1773; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 1774; GFX10-NEXT: v_rcp_f32_e32 v4, v4 1775; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 1776; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 1777; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 1778; GFX10-NEXT: v_trunc_f16_e32 v3, v3 1779; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1 1780; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1781; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1782; GFX10-NEXT: 
v_cvt_f32_f16_e32 v5, v2 1783; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 1784; GFX10-NEXT: v_rcp_f32_e32 v5, v5 1785; GFX10-NEXT: v_mul_f32_e32 v4, v4, v5 1786; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 1787; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1 1788; GFX10-NEXT: v_trunc_f16_e32 v4, v4 1789; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1 1790; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 1791; GFX10-NEXT: global_store_dword v0, v1, s[4:5] 1792; GFX10-NEXT: s_endpgm 1793; 1794; GFX11-LABEL: frem_v2f16: 1795; GFX11: ; %bb.0: 1796; GFX11-NEXT: s_clause 0x1 1797; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 1798; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 1799; GFX11-NEXT: v_mov_b32_e32 v0, 0 1800; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1801; GFX11-NEXT: s_clause 0x1 1802; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] 1803; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 1804; GFX11-NEXT: s_waitcnt vmcnt(1) 1805; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 1806; GFX11-NEXT: s_waitcnt vmcnt(0) 1807; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 1808; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 1809; GFX11-NEXT: v_rcp_f32_e32 v4, v4 1810; GFX11-NEXT: s_waitcnt_depctr 0xfff 1811; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4 1812; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 1813; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1814; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1 1815; GFX11-NEXT: v_trunc_f16_e32 v3, v3 1816; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 1817; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v1 1818; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1819; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1820; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2 1821; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1822; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 1823; GFX11-NEXT: v_rcp_f32_e32 v5, v5 1824; GFX11-NEXT: s_waitcnt_depctr 0xfff 1825; GFX11-NEXT: v_mul_f32_e32 v4, v4, v5 1826; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1827; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 1828; GFX11-NEXT: v_div_fixup_f16 v4, v4, v2, v1 1829; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1830; GFX11-NEXT: v_trunc_f16_e32 v4, v4 1831; GFX11-NEXT: v_fma_f16 v1, -v4, v2, v1 1832; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1833; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1 1834; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] 1835; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1836; GFX11-NEXT: s_endpgm 1837 <2 x half> addrspace(1)* %in2) #0 { 1838 %gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4 1839 %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8 1840 %r1 = load <2 x half>, <2 x half> addrspace(1)* %gep2, align 8 1841 %r2 = frem <2 x half> %r0, %r1 1842 store <2 x half> %r2, <2 x half> addrspace(1)* %out, align 8 1843 ret void 1844} 1845 1846define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in1, 1847; SI-LABEL: frem_v4f16: 1848; SI: ; %bb.0: 1849; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1850; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1851; SI-NEXT: s_mov_b32 s3, 0xf000 1852; SI-NEXT: s_mov_b32 s2, -1 1853; SI-NEXT: s_waitcnt lgkmcnt(0) 1854; SI-NEXT: s_mov_b32 s0, s4 1855; SI-NEXT: s_mov_b32 s1, s5 1856; SI-NEXT: s_mov_b32 s4, s6 1857; SI-NEXT: s_mov_b32 s5, s7 1858; SI-NEXT: s_mov_b32 s6, s2 1859; SI-NEXT: s_mov_b32 s7, s3 1860; SI-NEXT: s_mov_b32 s10, s2 1861; SI-NEXT: s_mov_b32 s11, s3 1862; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1863; SI-NEXT: s_waitcnt vmcnt(0) 1864; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 1865; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1866; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 1867; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 1868; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 1869; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 1870; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32 1871; SI-NEXT: s_waitcnt vmcnt(0) 
1872; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 1873; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1874; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1875; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 1876; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1877; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1878; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 1879; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 1880; SI-NEXT: v_rcp_f32_e32 v10, v9 1881; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1882; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 1883; SI-NEXT: v_fma_f32 v10, v11, v10, v10 1884; SI-NEXT: v_mul_f32_e32 v11, v8, v10 1885; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 1886; SI-NEXT: v_fma_f32 v11, v12, v10, v11 1887; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 1888; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1889; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 1890; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 1891; SI-NEXT: v_trunc_f32_e32 v8, v8 1892; SI-NEXT: v_fma_f32 v1, -v8, v1, v5 1893; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1894; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1895; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1896; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 1897; SI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 1898; SI-NEXT: v_rcp_f32_e32 v9, v8 1899; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1900; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 1901; SI-NEXT: v_fma_f32 v9, v10, v9, v9 1902; SI-NEXT: v_mul_f32_e32 v10, v5, v9 1903; SI-NEXT: v_fma_f32 v11, -v8, v10, v5 1904; SI-NEXT: v_fma_f32 v10, v11, v9, v10 1905; SI-NEXT: v_fma_f32 v5, -v8, v10, v5 1906; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1907; SI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 1908; SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 1909; SI-NEXT: v_trunc_f32_e32 v5, v5 1910; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 1911; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 1912; SI-NEXT: v_or_b32_e32 v1, v4, v1 1913; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 1914; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 1915; SI-NEXT: v_rcp_f32_e32 v7, v5 
1916; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1917; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 1918; SI-NEXT: v_fma_f32 v7, v8, v7, v7 1919; SI-NEXT: v_mul_f32_e32 v8, v4, v7 1920; SI-NEXT: v_fma_f32 v9, -v5, v8, v4 1921; SI-NEXT: v_fma_f32 v8, v9, v7, v8 1922; SI-NEXT: v_fma_f32 v4, -v5, v8, v4 1923; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1924; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 1925; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 1926; SI-NEXT: v_trunc_f32_e32 v4, v4 1927; SI-NEXT: v_fma_f32 v0, -v4, v0, v3 1928; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1929; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1930; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 1931; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 1932; SI-NEXT: v_rcp_f32_e32 v5, v4 1933; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1934; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 1935; SI-NEXT: v_fma_f32 v5, v7, v5, v5 1936; SI-NEXT: v_mul_f32_e32 v7, v3, v5 1937; SI-NEXT: v_fma_f32 v8, -v4, v7, v3 1938; SI-NEXT: v_fma_f32 v7, v8, v5, v7 1939; SI-NEXT: v_fma_f32 v3, -v4, v7, v3 1940; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1941; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 1942; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 1943; SI-NEXT: v_trunc_f32_e32 v3, v3 1944; SI-NEXT: v_fma_f32 v2, -v3, v6, v2 1945; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1946; SI-NEXT: v_or_b32_e32 v0, v2, v0 1947; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1948; SI-NEXT: s_endpgm 1949; 1950; CI-LABEL: frem_v4f16: 1951; CI: ; %bb.0: 1952; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1953; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1954; CI-NEXT: s_mov_b32 s3, 0xf000 1955; CI-NEXT: s_mov_b32 s2, -1 1956; CI-NEXT: s_mov_b32 s10, s2 1957; CI-NEXT: s_waitcnt lgkmcnt(0) 1958; CI-NEXT: s_mov_b32 s0, s4 1959; CI-NEXT: s_mov_b32 s1, s5 1960; CI-NEXT: s_mov_b32 s4, s6 1961; CI-NEXT: s_mov_b32 s5, s7 1962; CI-NEXT: s_mov_b32 s6, s2 1963; CI-NEXT: s_mov_b32 s7, s3 1964; CI-NEXT: buffer_load_dwordx2 v[0:1], off, 
s[4:7], 0 1965; CI-NEXT: s_mov_b32 s11, s3 1966; CI-NEXT: s_waitcnt vmcnt(0) 1967; CI-NEXT: v_cvt_f32_f16_e32 v2, v0 1968; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1969; CI-NEXT: v_cvt_f32_f16_e32 v3, v0 1970; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 1971; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 1972; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 1973; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32 1974; CI-NEXT: s_waitcnt vmcnt(0) 1975; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 1976; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1977; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 1978; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 1979; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1980; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1981; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 1982; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 1983; CI-NEXT: v_rcp_f32_e32 v10, v9 1984; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1985; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 1986; CI-NEXT: v_fma_f32 v10, v11, v10, v10 1987; CI-NEXT: v_mul_f32_e32 v11, v8, v10 1988; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 1989; CI-NEXT: v_fma_f32 v11, v12, v10, v11 1990; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 1991; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1992; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 1993; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 1994; CI-NEXT: v_trunc_f32_e32 v8, v8 1995; CI-NEXT: v_fma_f32 v1, -v8, v1, v5 1996; CI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 1997; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 1998; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1999; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 2000; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2001; CI-NEXT: v_rcp_f32_e32 v9, v8 2002; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2003; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 2004; CI-NEXT: v_fma_f32 v9, v10, v9, v9 2005; CI-NEXT: v_mul_f32_e32 v10, v5, v9 2006; CI-NEXT: v_fma_f32 v11, -v8, v10, v5 2007; CI-NEXT: v_fma_f32 v10, v11, v9, v10 2008; CI-NEXT: v_fma_f32 v5, -v8, v10, v5 2009; CI-NEXT: 
s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2010; CI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 2011; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 2012; CI-NEXT: v_trunc_f32_e32 v5, v5 2013; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 2014; CI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 2015; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 2016; CI-NEXT: v_or_b32_e32 v1, v4, v1 2017; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 2018; CI-NEXT: v_rcp_f32_e32 v7, v5 2019; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2020; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 2021; CI-NEXT: v_fma_f32 v7, v8, v7, v7 2022; CI-NEXT: v_mul_f32_e32 v8, v4, v7 2023; CI-NEXT: v_fma_f32 v9, -v5, v8, v4 2024; CI-NEXT: v_fma_f32 v8, v9, v7, v8 2025; CI-NEXT: v_fma_f32 v4, -v5, v8, v4 2026; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2027; CI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 2028; CI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 2029; CI-NEXT: v_trunc_f32_e32 v4, v4 2030; CI-NEXT: v_fma_f32 v0, -v4, v0, v3 2031; CI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 2032; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 2033; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2034; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2035; CI-NEXT: v_rcp_f32_e32 v5, v4 2036; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2037; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 2038; CI-NEXT: v_fma_f32 v5, v7, v5, v5 2039; CI-NEXT: v_mul_f32_e32 v7, v3, v5 2040; CI-NEXT: v_fma_f32 v8, -v4, v7, v3 2041; CI-NEXT: v_fma_f32 v7, v8, v5, v7 2042; CI-NEXT: v_fma_f32 v3, -v4, v7, v3 2043; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2044; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 2045; CI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 2046; CI-NEXT: v_trunc_f32_e32 v3, v3 2047; CI-NEXT: v_fma_f32 v2, -v3, v6, v2 2048; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 2049; CI-NEXT: v_or_b32_e32 v0, v2, v0 2050; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2051; CI-NEXT: s_endpgm 2052; 2053; VI-LABEL: frem_v4f16: 2054; VI: ; %bb.0: 2055; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 2056; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2057; VI-NEXT: s_waitcnt lgkmcnt(0) 2058; VI-NEXT: v_mov_b32_e32 v2, s6 2059; VI-NEXT: s_add_u32 s0, s0, 32 2060; VI-NEXT: s_addc_u32 s1, s1, 0 2061; VI-NEXT: v_mov_b32_e32 v5, s1 2062; VI-NEXT: v_mov_b32_e32 v4, s0 2063; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 2064; VI-NEXT: v_mov_b32_e32 v3, s7 2065; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 2066; VI-NEXT: v_mov_b32_e32 v0, s4 2067; VI-NEXT: v_mov_b32_e32 v1, s5 2068; VI-NEXT: s_waitcnt vmcnt(1) 2069; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 2070; VI-NEXT: v_cvt_f32_f16_e32 v9, v8 2071; VI-NEXT: s_waitcnt vmcnt(0) 2072; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 2073; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 2074; VI-NEXT: v_rcp_f32_e32 v9, v9 2075; VI-NEXT: v_mul_f32_e32 v7, v7, v9 2076; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 2077; VI-NEXT: v_div_fixup_f16 v7, v7, v8, v6 2078; VI-NEXT: v_trunc_f16_e32 v7, v7 2079; VI-NEXT: v_fma_f16 v6, -v7, v8, v6 2080; VI-NEXT: v_cvt_f32_f16_e32 v8, v5 2081; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 2082; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 2083; VI-NEXT: v_rcp_f32_e32 v8, v8 2084; VI-NEXT: v_mul_f32_e32 v7, v7, v8 2085; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 2086; VI-NEXT: v_div_fixup_f16 v7, v7, v5, v3 2087; VI-NEXT: v_trunc_f16_e32 v7, v7 2088; VI-NEXT: v_fma_f16 v3, -v7, v5, v3 2089; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 2090; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 2091; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 2092; VI-NEXT: v_or_b32_e32 v3, v3, v6 2093; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 2094; VI-NEXT: v_rcp_f32_e32 v8, v8 2095; VI-NEXT: v_mul_f32_e32 v6, v6, v8 2096; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 2097; VI-NEXT: v_div_fixup_f16 v6, v6, v7, v5 2098; VI-NEXT: v_trunc_f16_e32 v6, v6 2099; VI-NEXT: v_fma_f16 v5, -v6, v7, v5 2100; VI-NEXT: v_cvt_f32_f16_e32 v7, v4 2101; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 2102; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 2103; VI-NEXT: v_rcp_f32_e32 v7, v7 2104; VI-NEXT: v_mul_f32_e32 v6, v6, v7 2105; VI-NEXT: v_cvt_f16_f32_e32 
v6, v6 2106; VI-NEXT: v_div_fixup_f16 v6, v6, v4, v2 2107; VI-NEXT: v_trunc_f16_e32 v6, v6 2108; VI-NEXT: v_fma_f16 v2, -v6, v4, v2 2109; VI-NEXT: v_or_b32_e32 v2, v2, v5 2110; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 2111; VI-NEXT: s_endpgm 2112; 2113; GFX9-LABEL: frem_v4f16: 2114; GFX9: ; %bb.0: 2115; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2116; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2117; GFX9-NEXT: v_mov_b32_e32 v4, 0 2118; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2119; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] 2120; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 2121; GFX9-NEXT: s_waitcnt vmcnt(1) 2122; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v1 2123; GFX9-NEXT: s_waitcnt vmcnt(0) 2124; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3 2125; GFX9-NEXT: v_rcp_f32_e32 v6, v6 2126; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 2127; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 2128; GFX9-NEXT: v_div_fixup_f16 v5, v5, v3, v1 2129; GFX9-NEXT: v_trunc_f16_e32 v5, v5 2130; GFX9-NEXT: v_fma_f16 v5, -v5, v3, v1 2131; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2132; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v3 2133; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2134; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v1 2135; GFX9-NEXT: v_rcp_f32_e32 v7, v7 2136; GFX9-NEXT: v_mul_f32_e32 v6, v6, v7 2137; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6 2138; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1 2139; GFX9-NEXT: v_trunc_f16_e32 v6, v6 2140; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1 2141; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1 2142; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 2143; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 2144; GFX9-NEXT: v_rcp_f32_e32 v5, v5 2145; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5 2146; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 2147; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0 2148; GFX9-NEXT: v_trunc_f16_e32 v3, v3 2149; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0 2150; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 2151; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v2 2152; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2153; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v0 
2154; GFX9-NEXT: v_rcp_f32_e32 v6, v6 2155; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 2156; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 2157; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0 2158; GFX9-NEXT: v_trunc_f16_e32 v5, v5 2159; GFX9-NEXT: v_fma_f16 v0, -v5, v2, v0 2160; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0 2161; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 2162; GFX9-NEXT: s_endpgm 2163; 2164; GFX10-LABEL: frem_v4f16: 2165; GFX10: ; %bb.0: 2166; GFX10-NEXT: s_clause 0x1 2167; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2168; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2169; GFX10-NEXT: v_mov_b32_e32 v4, 0 2170; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2171; GFX10-NEXT: s_clause 0x1 2172; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] 2173; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 2174; GFX10-NEXT: s_waitcnt vmcnt(1) 2175; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 2176; GFX10-NEXT: s_waitcnt vmcnt(0) 2177; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3 2178; GFX10-NEXT: v_rcp_f32_e32 v6, v6 2179; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 2180; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 2181; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1 2182; GFX10-NEXT: v_trunc_f16_e32 v5, v5 2183; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1 2184; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2185; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2186; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3 2187; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1 2188; GFX10-NEXT: v_rcp_f32_e32 v7, v7 2189; GFX10-NEXT: v_mul_f32_e32 v6, v6, v7 2190; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v6 2191; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1 2192; GFX10-NEXT: v_trunc_f16_e32 v6, v6 2193; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1 2194; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 2195; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1 2196; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 2197; GFX10-NEXT: v_rcp_f32_e32 v5, v5 2198; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 2199; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 2200; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0 2201; GFX10-NEXT: 
v_trunc_f16_e32 v3, v3 2202; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0 2203; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 2204; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2205; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2 2206; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 2207; GFX10-NEXT: v_rcp_f32_e32 v6, v6 2208; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 2209; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 2210; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0 2211; GFX10-NEXT: v_trunc_f16_e32 v5, v5 2212; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0 2213; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0 2214; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 2215; GFX10-NEXT: s_endpgm 2216; 2217; GFX11-LABEL: frem_v4f16: 2218; GFX11: ; %bb.0: 2219; GFX11-NEXT: s_clause 0x1 2220; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 2221; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 2222; GFX11-NEXT: v_mov_b32_e32 v4, 0 2223; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2224; GFX11-NEXT: s_clause 0x1 2225; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] 2226; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 2227; GFX11-NEXT: s_waitcnt vmcnt(1) 2228; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v1 2229; GFX11-NEXT: s_waitcnt vmcnt(0) 2230; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3 2231; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 2232; GFX11-NEXT: v_rcp_f32_e32 v6, v6 2233; GFX11-NEXT: s_waitcnt_depctr 0xfff 2234; GFX11-NEXT: v_mul_f32_e32 v5, v5, v6 2235; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5 2236; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2237; GFX11-NEXT: v_div_fixup_f16 v5, v5, v3, v1 2238; GFX11-NEXT: v_trunc_f16_e32 v5, v5 2239; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 2240; GFX11-NEXT: v_fma_f16 v5, -v5, v3, v1 2241; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2242; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2243; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v3 2244; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) 2245; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v1 2246; GFX11-NEXT: v_rcp_f32_e32 v7, v7 2247; GFX11-NEXT: s_waitcnt_depctr 0xfff 2248; GFX11-NEXT: v_mul_f32_e32 v6, v6, v7 2249; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2250; GFX11-NEXT: v_cvt_f16_f32_e32 v6, v6 2251; GFX11-NEXT: v_div_fixup_f16 v6, v6, v3, v1 2252; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2253; GFX11-NEXT: v_trunc_f16_e32 v6, v6 2254; GFX11-NEXT: v_fma_f16 v1, -v6, v3, v1 2255; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 2256; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 2257; GFX11-NEXT: v_pack_b32_f16 v1, v5, v1 2258; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2 2259; GFX11-NEXT: v_rcp_f32_e32 v5, v5 2260; GFX11-NEXT: s_waitcnt_depctr 0xfff 2261; GFX11-NEXT: v_mul_f32_e32 v3, v3, v5 2262; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2263; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 2264; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v0 2265; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2266; GFX11-NEXT: v_trunc_f16_e32 v3, v3 2267; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v0 2268; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 2269; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2270; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2271; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v2 2272; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v0 2273; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) 2274; GFX11-NEXT: v_rcp_f32_e32 v6, v6 2275; GFX11-NEXT: s_waitcnt_depctr 0xfff 2276; GFX11-NEXT: v_mul_f32_e32 v5, v5, v6 2277; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5 2278; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2279; GFX11-NEXT: v_div_fixup_f16 v5, v5, v2, v0 2280; GFX11-NEXT: v_trunc_f16_e32 v5, v5 2281; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) 2282; GFX11-NEXT: v_fma_f16 v0, -v5, v2, v0 2283; GFX11-NEXT: v_pack_b32_f16 v0, v3, v0 2284; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] 2285; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2286; GFX11-NEXT: s_endpgm 2287 <4 x half> addrspace(1)* %in2) #0 { 2288 %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4 2289 %r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16 2290 %r1 = load <4 x half>, <4 x half> addrspace(1)* %gep2, align 16 2291 %r2 = frem <4 x half> %r0, %r1 2292 store <4 x half> %r2, <4 x half> addrspace(1)* %out, align 16 2293 ret void 2294} 2295 2296define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, 2297; SI-LABEL: frem_v2f32: 2298; SI: ; %bb.0: 2299; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2300; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 2301; SI-NEXT: s_mov_b32 s3, 0xf000 2302; SI-NEXT: s_mov_b32 s2, -1 2303; SI-NEXT: s_waitcnt lgkmcnt(0) 2304; SI-NEXT: s_mov_b32 s0, s4 2305; SI-NEXT: s_mov_b32 s1, s5 2306; SI-NEXT: s_mov_b32 s4, s6 2307; SI-NEXT: s_mov_b32 s5, s7 2308; SI-NEXT: s_mov_b32 s6, s2 2309; SI-NEXT: s_mov_b32 s7, s3 2310; SI-NEXT: s_mov_b32 s10, s2 2311; SI-NEXT: s_mov_b32 s11, s3 2312; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 2313; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 2314; SI-NEXT: s_waitcnt vmcnt(0) 2315; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 2316; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 2317; SI-NEXT: v_rcp_f32_e32 v6, v5 2318; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2319; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 2320; SI-NEXT: v_fma_f32 v6, v7, v6, v6 2321; SI-NEXT: v_mul_f32_e32 v7, v4, v6 2322; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 2323; SI-NEXT: v_fma_f32 v7, v8, v6, v7 2324; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 2325; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2326; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 2327; SI-NEXT: 
v_div_fixup_f32 v4, v4, v3, v1 2328; SI-NEXT: v_trunc_f32_e32 v4, v4 2329; SI-NEXT: v_fma_f32 v1, -v4, v3, v1 2330; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 2331; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 2332; SI-NEXT: v_rcp_f32_e32 v5, v4 2333; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2334; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 2335; SI-NEXT: v_fma_f32 v5, v6, v5, v5 2336; SI-NEXT: v_mul_f32_e32 v6, v3, v5 2337; SI-NEXT: v_fma_f32 v7, -v4, v6, v3 2338; SI-NEXT: v_fma_f32 v6, v7, v5, v6 2339; SI-NEXT: v_fma_f32 v3, -v4, v6, v3 2340; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2341; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 2342; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 2343; SI-NEXT: v_trunc_f32_e32 v3, v3 2344; SI-NEXT: v_fma_f32 v0, -v3, v2, v0 2345; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2346; SI-NEXT: s_endpgm 2347; 2348; CI-LABEL: frem_v2f32: 2349; CI: ; %bb.0: 2350; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2351; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 2352; CI-NEXT: s_mov_b32 s3, 0xf000 2353; CI-NEXT: s_mov_b32 s2, -1 2354; CI-NEXT: s_mov_b32 s10, s2 2355; CI-NEXT: s_waitcnt lgkmcnt(0) 2356; CI-NEXT: s_mov_b32 s0, s4 2357; CI-NEXT: s_mov_b32 s1, s5 2358; CI-NEXT: s_mov_b32 s4, s6 2359; CI-NEXT: s_mov_b32 s5, s7 2360; CI-NEXT: s_mov_b32 s6, s2 2361; CI-NEXT: s_mov_b32 s7, s3 2362; CI-NEXT: s_mov_b32 s11, s3 2363; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 2364; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 2365; CI-NEXT: s_waitcnt vmcnt(0) 2366; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 2367; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 2368; CI-NEXT: v_rcp_f32_e32 v6, v5 2369; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2370; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 2371; CI-NEXT: v_fma_f32 v6, v7, v6, v6 2372; CI-NEXT: v_mul_f32_e32 v7, v4, v6 2373; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 2374; CI-NEXT: v_fma_f32 v7, v8, v6, v7 2375; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 
2376; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2377; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 2378; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 2379; CI-NEXT: v_trunc_f32_e32 v4, v4 2380; CI-NEXT: v_fma_f32 v1, -v4, v3, v1 2381; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 2382; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 2383; CI-NEXT: v_rcp_f32_e32 v5, v4 2384; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2385; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 2386; CI-NEXT: v_fma_f32 v5, v6, v5, v5 2387; CI-NEXT: v_mul_f32_e32 v6, v3, v5 2388; CI-NEXT: v_fma_f32 v7, -v4, v6, v3 2389; CI-NEXT: v_fma_f32 v6, v7, v5, v6 2390; CI-NEXT: v_fma_f32 v3, -v4, v6, v3 2391; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2392; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 2393; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 2394; CI-NEXT: v_trunc_f32_e32 v3, v3 2395; CI-NEXT: v_fma_f32 v0, -v3, v2, v0 2396; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2397; CI-NEXT: s_endpgm 2398; 2399; VI-LABEL: frem_v2f32: 2400; VI: ; %bb.0: 2401; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2402; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2403; VI-NEXT: s_waitcnt lgkmcnt(0) 2404; VI-NEXT: v_mov_b32_e32 v2, s6 2405; VI-NEXT: s_add_u32 s0, s0, 32 2406; VI-NEXT: s_addc_u32 s1, s1, 0 2407; VI-NEXT: v_mov_b32_e32 v5, s1 2408; VI-NEXT: v_mov_b32_e32 v3, s7 2409; VI-NEXT: v_mov_b32_e32 v4, s0 2410; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 2411; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 2412; VI-NEXT: v_mov_b32_e32 v0, s4 2413; VI-NEXT: v_mov_b32_e32 v1, s5 2414; VI-NEXT: s_waitcnt vmcnt(0) 2415; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3 2416; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3 2417; VI-NEXT: v_rcp_f32_e32 v8, v7 2418; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2419; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 2420; VI-NEXT: v_fma_f32 v8, v9, v8, v8 2421; VI-NEXT: v_mul_f32_e32 v9, v6, v8 2422; VI-NEXT: v_fma_f32 v10, -v7, v9, v6 2423; VI-NEXT: v_fma_f32 v9, 
v10, v8, v9 2424; VI-NEXT: v_fma_f32 v6, -v7, v9, v6 2425; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2426; VI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 2427; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3 2428; VI-NEXT: v_trunc_f32_e32 v6, v6 2429; VI-NEXT: v_fma_f32 v3, -v6, v5, v3 2430; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2 2431; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2 2432; VI-NEXT: v_rcp_f32_e32 v7, v6 2433; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2434; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2435; VI-NEXT: v_fma_f32 v7, v8, v7, v7 2436; VI-NEXT: v_mul_f32_e32 v8, v5, v7 2437; VI-NEXT: v_fma_f32 v9, -v6, v8, v5 2438; VI-NEXT: v_fma_f32 v8, v9, v7, v8 2439; VI-NEXT: v_fma_f32 v5, -v6, v8, v5 2440; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2441; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2442; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2 2443; VI-NEXT: v_trunc_f32_e32 v5, v5 2444; VI-NEXT: v_fma_f32 v2, -v5, v4, v2 2445; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 2446; VI-NEXT: s_endpgm 2447; 2448; GFX9-LABEL: frem_v2f32: 2449; GFX9: ; %bb.0: 2450; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2451; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2452; GFX9-NEXT: v_mov_b32_e32 v4, 0 2453; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2454; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] 2455; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 2456; GFX9-NEXT: s_waitcnt vmcnt(0) 2457; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1 2458; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 2459; GFX9-NEXT: v_rcp_f32_e32 v7, v6 2460; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2461; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2462; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7 2463; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7 2464; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5 2465; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8 2466; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5 2467; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2468; GFX9-NEXT: v_div_fmas_f32 v5, 
v5, v7, v8 2469; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1 2470; GFX9-NEXT: v_trunc_f32_e32 v5, v5 2471; GFX9-NEXT: v_fma_f32 v1, -v5, v3, v1 2472; GFX9-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v0 2473; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 2474; GFX9-NEXT: v_rcp_f32_e32 v6, v5 2475; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2476; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0 2477; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6 2478; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6 2479; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3 2480; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7 2481; GFX9-NEXT: v_fma_f32 v3, -v5, v7, v3 2482; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2483; GFX9-NEXT: v_div_fmas_f32 v3, v3, v6, v7 2484; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0 2485; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2486; GFX9-NEXT: v_fma_f32 v0, -v3, v2, v0 2487; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 2488; GFX9-NEXT: s_endpgm 2489; 2490; GFX10-LABEL: frem_v2f32: 2491; GFX10: ; %bb.0: 2492; GFX10-NEXT: s_clause 0x1 2493; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2494; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2495; GFX10-NEXT: v_mov_b32_e32 v4, 0 2496; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2497; GFX10-NEXT: s_clause 0x1 2498; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] 2499; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 2500; GFX10-NEXT: s_waitcnt vmcnt(0) 2501; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1 2502; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 2503; GFX10-NEXT: v_rcp_f32_e32 v7, v6 2504; GFX10-NEXT: s_denorm_mode 15 2505; GFX10-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2506; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v7 2507; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7 2508; GFX10-NEXT: v_fma_f32 v9, -v6, v8, v5 2509; GFX10-NEXT: v_fmac_f32_e32 v8, v9, v7 2510; GFX10-NEXT: v_fma_f32 v5, -v6, v8, v5 2511; GFX10-NEXT: s_denorm_mode 12 2512; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2513; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1 2514; GFX10-NEXT: 
v_trunc_f32_e32 v5, v5 2515; GFX10-NEXT: v_fma_f32 v1, -v5, v3, v1 2516; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0 2517; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 2518; GFX10-NEXT: v_rcp_f32_e32 v6, v5 2519; GFX10-NEXT: s_denorm_mode 15 2520; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0 2521; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v6 2522; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6 2523; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v3 2524; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v6 2525; GFX10-NEXT: v_fma_f32 v3, -v5, v7, v3 2526; GFX10-NEXT: s_denorm_mode 12 2527; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7 2528; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0 2529; GFX10-NEXT: v_trunc_f32_e32 v3, v3 2530; GFX10-NEXT: v_fma_f32 v0, -v3, v2, v0 2531; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 2532; GFX10-NEXT: s_endpgm 2533; 2534; GFX11-LABEL: frem_v2f32: 2535; GFX11: ; %bb.0: 2536; GFX11-NEXT: s_clause 0x1 2537; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 2538; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 2539; GFX11-NEXT: v_mov_b32_e32 v4, 0 2540; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2541; GFX11-NEXT: s_clause 0x1 2542; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] 2543; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 2544; GFX11-NEXT: s_waitcnt vmcnt(0) 2545; GFX11-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 2546; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 2547; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 2548; GFX11-NEXT: v_rcp_f32_e32 v7, v6 2549; GFX11-NEXT: s_denorm_mode 15 2550; GFX11-NEXT: s_waitcnt_depctr 0xfff 2551; GFX11-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2552; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v7 2553; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2554; GFX11-NEXT: v_mul_f32_e32 v8, v5, v7 2555; GFX11-NEXT: v_fma_f32 v9, -v6, v8, v5 2556; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2557; GFX11-NEXT: v_fmac_f32_e32 v8, v9, v7 2558; 
GFX11-NEXT: v_fma_f32 v5, -v6, v8, v5 2559; GFX11-NEXT: s_denorm_mode 12 2560; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2561; GFX11-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2562; GFX11-NEXT: v_div_fixup_f32 v5, v5, v3, v1 2563; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2564; GFX11-NEXT: v_trunc_f32_e32 v5, v5 2565; GFX11-NEXT: v_fma_f32 v1, -v5, v3, v1 2566; GFX11-NEXT: v_div_scale_f32 v5, null, v2, v2, v0 2567; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 2568; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 2569; GFX11-NEXT: v_rcp_f32_e32 v6, v5 2570; GFX11-NEXT: s_denorm_mode 15 2571; GFX11-NEXT: s_waitcnt_depctr 0xfff 2572; GFX11-NEXT: v_fma_f32 v7, -v5, v6, 1.0 2573; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v6 2574; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2575; GFX11-NEXT: v_mul_f32_e32 v7, v3, v6 2576; GFX11-NEXT: v_fma_f32 v8, -v5, v7, v3 2577; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2578; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v6 2579; GFX11-NEXT: v_fma_f32 v3, -v5, v7, v3 2580; GFX11-NEXT: s_denorm_mode 12 2581; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2582; GFX11-NEXT: v_div_fmas_f32 v3, v3, v6, v7 2583; GFX11-NEXT: v_div_fixup_f32 v3, v3, v2, v0 2584; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2585; GFX11-NEXT: v_trunc_f32_e32 v3, v3 2586; GFX11-NEXT: v_fma_f32 v0, -v3, v2, v0 2587; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] 2588; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2589; GFX11-NEXT: s_endpgm 2590 <2 x float> addrspace(1)* %in2) #0 { 2591 %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 2592 %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8 2593 %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8 2594 %r2 = frem 
<2 x float> %r0, %r1 2595 store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8 2596 ret void 2597} 2598
; Test kernel frem_v4f32 loads one <4 x float> from %in1 and one from %in2
; advanced by four elements (getelementptr ... i32 4; this is the 64-byte
; displacement that shows up as `offset:64` or `s_add_u32 s0, s0, 64` in the
; expected output), computes their frem and stores the result to %out.
; For every run prefix the expected code repeats the same
; div_scale / rcp / fma / div_fmas / div_fixup / trunc / fma sequence four
; times, once per vector element, beginning with element 3 (registers v3 and
; v7) and finishing with element 0 (registers v0 and v4).
; The assertions are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them rather than editing
; them by hand.
2599define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, 2600; SI-LABEL: frem_v4f32: 2601; SI: ; %bb.0: 2602; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2603; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 2604; SI-NEXT: s_mov_b32 s3, 0xf000 2605; SI-NEXT: s_mov_b32 s2, -1 2606; SI-NEXT: s_waitcnt lgkmcnt(0) 2607; SI-NEXT: s_mov_b32 s0, s4 2608; SI-NEXT: s_mov_b32 s1, s5 2609; SI-NEXT: s_mov_b32 s4, s6 2610; SI-NEXT: s_mov_b32 s5, s7 2611; SI-NEXT: s_mov_b32 s6, s2 2612; SI-NEXT: s_mov_b32 s7, s3 2613; SI-NEXT: s_mov_b32 s10, s2 2614; SI-NEXT: s_mov_b32 s11, s3 2615; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 2616; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 2617; SI-NEXT: s_waitcnt vmcnt(0) 2618; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 2619; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 2620; SI-NEXT: v_rcp_f32_e32 v10, v9 2621; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2622; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 2623; SI-NEXT: v_fma_f32 v10, v11, v10, v10 2624; SI-NEXT: v_mul_f32_e32 v11, v8, v10 2625; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 2626; SI-NEXT: v_fma_f32 v11, v12, v10, v11 2627; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 2628; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2629; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 2630; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 2631; SI-NEXT: v_trunc_f32_e32 v8, v8 2632; SI-NEXT: v_fma_f32 v3, -v8, v7, v3 2633; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 2634; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 2635; SI-NEXT: v_rcp_f32_e32 v9, v8 2636; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2637; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 2638; SI-NEXT: v_fma_f32 v9, v10, v9, v9 2639; SI-NEXT: v_mul_f32_e32 v10, v7, v9 2640; SI-NEXT: v_fma_f32 v11, -v8, v10, v7 2641; SI-NEXT: v_fma_f32 v10, v11, v9, 
v10 2642; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 2643; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2644; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 2645; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 2646; SI-NEXT: v_trunc_f32_e32 v7, v7 2647; SI-NEXT: v_fma_f32 v2, -v7, v6, v2 2648; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 2649; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 2650; SI-NEXT: v_rcp_f32_e32 v8, v7 2651; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2652; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 2653; SI-NEXT: v_fma_f32 v8, v9, v8, v8 2654; SI-NEXT: v_mul_f32_e32 v9, v6, v8 2655; SI-NEXT: v_fma_f32 v10, -v7, v9, v6 2656; SI-NEXT: v_fma_f32 v9, v10, v8, v9 2657; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 2658; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2659; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 2660; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2661; SI-NEXT: v_trunc_f32_e32 v6, v6 2662; SI-NEXT: v_fma_f32 v1, -v6, v5, v1 2663; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 2664; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 2665; SI-NEXT: v_rcp_f32_e32 v7, v6 2666; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2667; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2668; SI-NEXT: v_fma_f32 v7, v8, v7, v7 2669; SI-NEXT: v_mul_f32_e32 v8, v5, v7 2670; SI-NEXT: v_fma_f32 v9, -v6, v8, v5 2671; SI-NEXT: v_fma_f32 v8, v9, v7, v8 2672; SI-NEXT: v_fma_f32 v5, -v6, v8, v5 2673; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2674; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2675; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2676; SI-NEXT: v_trunc_f32_e32 v5, v5 2677; SI-NEXT: v_fma_f32 v0, -v5, v4, v0 2678; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2679; SI-NEXT: s_endpgm 2680; 2681; CI-LABEL: frem_v4f32: 2682; CI: ; %bb.0: 2683; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2684; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 2685; CI-NEXT: s_mov_b32 s3, 0xf000 2686; CI-NEXT: s_mov_b32 s2, -1 2687; CI-NEXT: s_mov_b32 s10, s2 2688; CI-NEXT: 
s_waitcnt lgkmcnt(0) 2689; CI-NEXT: s_mov_b32 s0, s4 2690; CI-NEXT: s_mov_b32 s1, s5 2691; CI-NEXT: s_mov_b32 s4, s6 2692; CI-NEXT: s_mov_b32 s5, s7 2693; CI-NEXT: s_mov_b32 s6, s2 2694; CI-NEXT: s_mov_b32 s7, s3 2695; CI-NEXT: s_mov_b32 s11, s3 2696; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 2697; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 2698; CI-NEXT: s_waitcnt vmcnt(0) 2699; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 2700; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 2701; CI-NEXT: v_rcp_f32_e32 v10, v9 2702; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2703; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 2704; CI-NEXT: v_fma_f32 v10, v11, v10, v10 2705; CI-NEXT: v_mul_f32_e32 v11, v8, v10 2706; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 2707; CI-NEXT: v_fma_f32 v11, v12, v10, v11 2708; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 2709; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2710; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 2711; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 2712; CI-NEXT: v_trunc_f32_e32 v8, v8 2713; CI-NEXT: v_fma_f32 v3, -v8, v7, v3 2714; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 2715; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 2716; CI-NEXT: v_rcp_f32_e32 v9, v8 2717; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2718; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 2719; CI-NEXT: v_fma_f32 v9, v10, v9, v9 2720; CI-NEXT: v_mul_f32_e32 v10, v7, v9 2721; CI-NEXT: v_fma_f32 v11, -v8, v10, v7 2722; CI-NEXT: v_fma_f32 v10, v11, v9, v10 2723; CI-NEXT: v_fma_f32 v7, -v8, v10, v7 2724; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2725; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 2726; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 2727; CI-NEXT: v_trunc_f32_e32 v7, v7 2728; CI-NEXT: v_fma_f32 v2, -v7, v6, v2 2729; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 2730; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 2731; CI-NEXT: v_rcp_f32_e32 v8, v7 2732; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 
2733; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 2734; CI-NEXT: v_fma_f32 v8, v9, v8, v8 2735; CI-NEXT: v_mul_f32_e32 v9, v6, v8 2736; CI-NEXT: v_fma_f32 v10, -v7, v9, v6 2737; CI-NEXT: v_fma_f32 v9, v10, v8, v9 2738; CI-NEXT: v_fma_f32 v6, -v7, v9, v6 2739; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2740; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 2741; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2742; CI-NEXT: v_trunc_f32_e32 v6, v6 2743; CI-NEXT: v_fma_f32 v1, -v6, v5, v1 2744; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 2745; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 2746; CI-NEXT: v_rcp_f32_e32 v7, v6 2747; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2748; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2749; CI-NEXT: v_fma_f32 v7, v8, v7, v7 2750; CI-NEXT: v_mul_f32_e32 v8, v5, v7 2751; CI-NEXT: v_fma_f32 v9, -v6, v8, v5 2752; CI-NEXT: v_fma_f32 v8, v9, v7, v8 2753; CI-NEXT: v_fma_f32 v5, -v6, v8, v5 2754; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2755; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2756; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2757; CI-NEXT: v_trunc_f32_e32 v5, v5 2758; CI-NEXT: v_fma_f32 v0, -v5, v4, v0 2759; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2760; CI-NEXT: s_endpgm 2761; 2762; VI-LABEL: frem_v4f32: 2763; VI: ; %bb.0: 2764; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2765; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2766; VI-NEXT: s_waitcnt lgkmcnt(0) 2767; VI-NEXT: v_mov_b32_e32 v0, s6 2768; VI-NEXT: s_add_u32 s0, s0, 64 2769; VI-NEXT: s_addc_u32 s1, s1, 0 2770; VI-NEXT: v_mov_b32_e32 v5, s1 2771; VI-NEXT: v_mov_b32_e32 v1, s7 2772; VI-NEXT: v_mov_b32_e32 v4, s0 2773; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2774; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 2775; VI-NEXT: v_mov_b32_e32 v8, s4 2776; VI-NEXT: v_mov_b32_e32 v9, s5 2777; VI-NEXT: s_waitcnt vmcnt(0) 2778; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3 2779; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 2780; VI-NEXT: v_rcp_f32_e32 v12, v11 2781; 
VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2782; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 2783; VI-NEXT: v_fma_f32 v12, v13, v12, v12 2784; VI-NEXT: v_mul_f32_e32 v13, v10, v12 2785; VI-NEXT: v_fma_f32 v14, -v11, v13, v10 2786; VI-NEXT: v_fma_f32 v13, v14, v12, v13 2787; VI-NEXT: v_fma_f32 v10, -v11, v13, v10 2788; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2789; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 2790; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3 2791; VI-NEXT: v_trunc_f32_e32 v10, v10 2792; VI-NEXT: v_fma_f32 v3, -v10, v7, v3 2793; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2 2794; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 2795; VI-NEXT: v_rcp_f32_e32 v11, v10 2796; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2797; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0 2798; VI-NEXT: v_fma_f32 v11, v12, v11, v11 2799; VI-NEXT: v_mul_f32_e32 v12, v7, v11 2800; VI-NEXT: v_fma_f32 v13, -v10, v12, v7 2801; VI-NEXT: v_fma_f32 v12, v13, v11, v12 2802; VI-NEXT: v_fma_f32 v7, -v10, v12, v7 2803; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2804; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12 2805; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 2806; VI-NEXT: v_trunc_f32_e32 v7, v7 2807; VI-NEXT: v_fma_f32 v2, -v7, v6, v2 2808; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 2809; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 2810; VI-NEXT: v_rcp_f32_e32 v10, v7 2811; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2812; VI-NEXT: v_fma_f32 v11, -v7, v10, 1.0 2813; VI-NEXT: v_fma_f32 v10, v11, v10, v10 2814; VI-NEXT: v_mul_f32_e32 v11, v6, v10 2815; VI-NEXT: v_fma_f32 v12, -v7, v11, v6 2816; VI-NEXT: v_fma_f32 v11, v12, v10, v11 2817; VI-NEXT: v_fma_f32 v6, -v7, v11, v6 2818; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2819; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 2820; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2821; VI-NEXT: v_trunc_f32_e32 v6, v6 2822; VI-NEXT: v_fma_f32 v1, -v6, v5, v1 2823; VI-NEXT: v_div_scale_f32 v6, 
s[0:1], v4, v4, v0 2824; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 2825; VI-NEXT: v_rcp_f32_e32 v7, v6 2826; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2827; VI-NEXT: v_fma_f32 v10, -v6, v7, 1.0 2828; VI-NEXT: v_fma_f32 v7, v10, v7, v7 2829; VI-NEXT: v_mul_f32_e32 v10, v5, v7 2830; VI-NEXT: v_fma_f32 v11, -v6, v10, v5 2831; VI-NEXT: v_fma_f32 v10, v11, v7, v10 2832; VI-NEXT: v_fma_f32 v5, -v6, v10, v5 2833; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2834; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v10 2835; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2836; VI-NEXT: v_trunc_f32_e32 v5, v5 2837; VI-NEXT: v_fma_f32 v0, -v5, v4, v0 2838; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 2839; VI-NEXT: s_endpgm 2840; 2841; GFX9-LABEL: frem_v4f32: 2842; GFX9: ; %bb.0: 2843; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2844; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2845; GFX9-NEXT: v_mov_b32_e32 v8, 0 2846; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2847; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] 2848; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 2849; GFX9-NEXT: s_waitcnt vmcnt(0) 2850; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3 2851; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 2852; GFX9-NEXT: v_rcp_f32_e32 v11, v10 2853; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2854; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0 2855; GFX9-NEXT: v_fma_f32 v11, v12, v11, v11 2856; GFX9-NEXT: v_mul_f32_e32 v12, v9, v11 2857; GFX9-NEXT: v_fma_f32 v13, -v10, v12, v9 2858; GFX9-NEXT: v_fma_f32 v12, v13, v11, v12 2859; GFX9-NEXT: v_fma_f32 v9, -v10, v12, v9 2860; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2861; GFX9-NEXT: v_div_fmas_f32 v9, v9, v11, v12 2862; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3 2863; GFX9-NEXT: v_trunc_f32_e32 v9, v9 2864; GFX9-NEXT: v_fma_f32 v3, -v9, v7, v3 2865; GFX9-NEXT: v_div_scale_f32 v9, s[0:1], v6, v6, v2 2866; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 2867; GFX9-NEXT: v_rcp_f32_e32 v10, 
v9 2868; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2869; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0 2870; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10 2871; GFX9-NEXT: v_mul_f32_e32 v11, v7, v10 2872; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v7 2873; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11 2874; GFX9-NEXT: v_fma_f32 v7, -v9, v11, v7 2875; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2876; GFX9-NEXT: v_div_fmas_f32 v7, v7, v10, v11 2877; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2 2878; GFX9-NEXT: v_trunc_f32_e32 v7, v7 2879; GFX9-NEXT: v_fma_f32 v2, -v7, v6, v2 2880; GFX9-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 2881; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 2882; GFX9-NEXT: v_rcp_f32_e32 v9, v7 2883; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2884; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0 2885; GFX9-NEXT: v_fma_f32 v9, v10, v9, v9 2886; GFX9-NEXT: v_mul_f32_e32 v10, v6, v9 2887; GFX9-NEXT: v_fma_f32 v11, -v7, v10, v6 2888; GFX9-NEXT: v_fma_f32 v10, v11, v9, v10 2889; GFX9-NEXT: v_fma_f32 v6, -v7, v10, v6 2890; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2891; GFX9-NEXT: v_div_fmas_f32 v6, v6, v9, v10 2892; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2893; GFX9-NEXT: v_trunc_f32_e32 v6, v6 2894; GFX9-NEXT: v_fma_f32 v1, -v6, v5, v1 2895; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 2896; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 2897; GFX9-NEXT: v_rcp_f32_e32 v7, v6 2898; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2899; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0 2900; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7 2901; GFX9-NEXT: v_mul_f32_e32 v9, v5, v7 2902; GFX9-NEXT: v_fma_f32 v10, -v6, v9, v5 2903; GFX9-NEXT: v_fma_f32 v9, v10, v7, v9 2904; GFX9-NEXT: v_fma_f32 v5, -v6, v9, v5 2905; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2906; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v9 2907; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2908; GFX9-NEXT: v_trunc_f32_e32 v5, v5 2909; GFX9-NEXT: v_fma_f32 v0, 
-v5, v4, v0 2910; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] 2911; GFX9-NEXT: s_endpgm 2912; 2913; GFX10-LABEL: frem_v4f32: 2914; GFX10: ; %bb.0: 2915; GFX10-NEXT: s_clause 0x1 2916; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2917; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2918; GFX10-NEXT: v_mov_b32_e32 v8, 0 2919; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2920; GFX10-NEXT: s_clause 0x1 2921; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] 2922; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 2923; GFX10-NEXT: s_waitcnt vmcnt(0) 2924; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3 2925; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 2926; GFX10-NEXT: v_rcp_f32_e32 v11, v10 2927; GFX10-NEXT: s_denorm_mode 15 2928; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0 2929; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v11 2930; GFX10-NEXT: v_mul_f32_e32 v12, v9, v11 2931; GFX10-NEXT: v_fma_f32 v13, -v10, v12, v9 2932; GFX10-NEXT: v_fmac_f32_e32 v12, v13, v11 2933; GFX10-NEXT: v_fma_f32 v9, -v10, v12, v9 2934; GFX10-NEXT: s_denorm_mode 12 2935; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12 2936; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3 2937; GFX10-NEXT: v_trunc_f32_e32 v9, v9 2938; GFX10-NEXT: v_fma_f32 v3, -v9, v7, v3 2939; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2 2940; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 2941; GFX10-NEXT: v_rcp_f32_e32 v10, v9 2942; GFX10-NEXT: s_denorm_mode 15 2943; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0 2944; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v10 2945; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10 2946; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7 2947; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v10 2948; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7 2949; GFX10-NEXT: s_denorm_mode 12 2950; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11 2951; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2 2952; GFX10-NEXT: v_trunc_f32_e32 v7, v7 2953; GFX10-NEXT: v_fma_f32 v2, -v7, v6, v2 2954; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1 2955; GFX10-NEXT: 
v_div_scale_f32 v6, vcc_lo, v1, v5, v1 2956; GFX10-NEXT: v_rcp_f32_e32 v9, v7 2957; GFX10-NEXT: s_denorm_mode 15 2958; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0 2959; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v9 2960; GFX10-NEXT: v_mul_f32_e32 v10, v6, v9 2961; GFX10-NEXT: v_fma_f32 v11, -v7, v10, v6 2962; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v9 2963; GFX10-NEXT: v_fma_f32 v6, -v7, v10, v6 2964; GFX10-NEXT: s_denorm_mode 12 2965; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10 2966; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2967; GFX10-NEXT: v_trunc_f32_e32 v6, v6 2968; GFX10-NEXT: v_fma_f32 v1, -v6, v5, v1 2969; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0 2970; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 2971; GFX10-NEXT: v_rcp_f32_e32 v7, v6 2972; GFX10-NEXT: s_denorm_mode 15 2973; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0 2974; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v7 2975; GFX10-NEXT: v_mul_f32_e32 v9, v5, v7 2976; GFX10-NEXT: v_fma_f32 v10, -v6, v9, v5 2977; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v7 2978; GFX10-NEXT: v_fma_f32 v5, -v6, v9, v5 2979; GFX10-NEXT: s_denorm_mode 12 2980; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9 2981; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2982; GFX10-NEXT: v_trunc_f32_e32 v5, v5 2983; GFX10-NEXT: v_fma_f32 v0, -v5, v4, v0 2984; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] 2985; GFX10-NEXT: s_endpgm 2986; 2987; GFX11-LABEL: frem_v4f32: 2988; GFX11: ; %bb.0: 2989; GFX11-NEXT: s_clause 0x1 2990; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 2991; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 2992; GFX11-NEXT: v_mov_b32_e32 v8, 0 2993; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2994; GFX11-NEXT: s_clause 0x1 2995; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] 2996; GFX11-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64 2997; GFX11-NEXT: s_waitcnt vmcnt(0) 2998; GFX11-NEXT: v_div_scale_f32 v10, null, v7, v7, v3 2999; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 3000; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | 
instid1(VALU_DEP_1) 3001; GFX11-NEXT: v_rcp_f32_e32 v11, v10 3002; GFX11-NEXT: s_denorm_mode 15 3003; GFX11-NEXT: s_waitcnt_depctr 0xfff 3004; GFX11-NEXT: v_fma_f32 v12, -v10, v11, 1.0 3005; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v11 3006; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3007; GFX11-NEXT: v_mul_f32_e32 v12, v9, v11 3008; GFX11-NEXT: v_fma_f32 v13, -v10, v12, v9 3009; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3010; GFX11-NEXT: v_fmac_f32_e32 v12, v13, v11 3011; GFX11-NEXT: v_fma_f32 v9, -v10, v12, v9 3012; GFX11-NEXT: s_denorm_mode 12 3013; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3014; GFX11-NEXT: v_div_fmas_f32 v9, v9, v11, v12 3015; GFX11-NEXT: v_div_fixup_f32 v9, v9, v7, v3 3016; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3017; GFX11-NEXT: v_trunc_f32_e32 v9, v9 3018; GFX11-NEXT: v_fma_f32 v3, -v9, v7, v3 3019; GFX11-NEXT: v_div_scale_f32 v9, null, v6, v6, v2 3020; GFX11-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 3021; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 3022; GFX11-NEXT: v_rcp_f32_e32 v10, v9 3023; GFX11-NEXT: s_denorm_mode 15 3024; GFX11-NEXT: s_waitcnt_depctr 0xfff 3025; GFX11-NEXT: v_fma_f32 v11, -v9, v10, 1.0 3026; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v10 3027; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3028; GFX11-NEXT: v_mul_f32_e32 v11, v7, v10 3029; GFX11-NEXT: v_fma_f32 v12, -v9, v11, v7 3030; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3031; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v10 3032; GFX11-NEXT: v_fma_f32 v7, -v9, v11, v7 3033; GFX11-NEXT: s_denorm_mode 12 3034; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3035; GFX11-NEXT: v_div_fmas_f32 v7, v7, v10, v11 3036; GFX11-NEXT: v_div_fixup_f32 v7, v7, v6, v2 3037; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3038; GFX11-NEXT: v_trunc_f32_e32 v7, v7 3039; GFX11-NEXT: v_fma_f32 v2, -v7, v6, v2 3040; GFX11-NEXT: v_div_scale_f32 v7, null, v5, v5, v1 3041; GFX11-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 3042; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 3043; GFX11-NEXT: v_rcp_f32_e32 v9, v7 3044; GFX11-NEXT: s_denorm_mode 15 3045; GFX11-NEXT: s_waitcnt_depctr 0xfff 3046; GFX11-NEXT: v_fma_f32 v10, -v7, v9, 1.0 3047; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v9 3048; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3049; GFX11-NEXT: v_mul_f32_e32 v10, v6, v9 3050; GFX11-NEXT: v_fma_f32 v11, -v7, v10, v6 3051; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3052; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v9 3053; GFX11-NEXT: v_fma_f32 v6, -v7, v10, v6 3054; GFX11-NEXT: s_denorm_mode 12 3055; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3056; GFX11-NEXT: v_div_fmas_f32 v6, v6, v9, v10 3057; GFX11-NEXT: v_div_fixup_f32 v6, v6, v5, v1 3058; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3059; GFX11-NEXT: v_trunc_f32_e32 v6, v6 3060; GFX11-NEXT: v_fma_f32 v1, -v6, v5, v1 3061; GFX11-NEXT: v_div_scale_f32 v6, null, v4, v4, v0 3062; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 3063; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 3064; GFX11-NEXT: v_rcp_f32_e32 v7, v6 3065; GFX11-NEXT: s_denorm_mode 15 3066; GFX11-NEXT: s_waitcnt_depctr 0xfff 3067; GFX11-NEXT: v_fma_f32 v9, -v6, v7, 1.0 3068; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v7 3069; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3070; GFX11-NEXT: v_mul_f32_e32 v9, v5, v7 3071; GFX11-NEXT: v_fma_f32 v10, -v6, v9, v5 3072; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) 3073; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v7 3074; GFX11-NEXT: v_fma_f32 v5, -v6, v9, v5 3075; GFX11-NEXT: s_denorm_mode 12 3076; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3077; GFX11-NEXT: v_div_fmas_f32 v5, v5, v7, v9 3078; GFX11-NEXT: v_div_fixup_f32 v5, v5, v4, v0 3079; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3080; GFX11-NEXT: v_trunc_f32_e32 v5, v5 3081; GFX11-NEXT: v_fma_f32 v0, -v5, v4, v0 3082; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] 3083; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3084; GFX11-NEXT: s_endpgm 3085 <4 x float> addrspace(1)* %in2) #0 { 3086 %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 3087 %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16 3088 %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16 3089 %r2 = frem <4 x float> %r0, %r1 3090 store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16 3091 ret void 3092} 3093 3094define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, 3095; SI-LABEL: frem_v2f64: 3096; SI: ; %bb.0: 3097; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 3098; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3099; SI-NEXT: s_mov_b32 s7, 0xf000 3100; SI-NEXT: s_mov_b32 s6, -1 3101; SI-NEXT: s_waitcnt lgkmcnt(0) 3102; SI-NEXT: s_mov_b32 s4, s8 3103; SI-NEXT: s_mov_b32 s5, s9 3104; SI-NEXT: s_mov_b32 s8, s10 3105; SI-NEXT: s_mov_b32 s9, s11 3106; SI-NEXT: s_mov_b32 s10, s6 3107; SI-NEXT: s_mov_b32 s11, s7 3108; SI-NEXT: s_mov_b32 s2, s6 3109; SI-NEXT: s_mov_b32 s3, s7 3110; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 3111; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64 3112; SI-NEXT: s_waitcnt vmcnt(0) 3113; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] 3114; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] 3115; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 3116; 
SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 3117; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 3118; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 3119; SI-NEXT: v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3] 3120; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] 3121; SI-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13] 3122; SI-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9 3123; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v13 3124; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc 3125; SI-NEXT: s_nop 1 3126; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15] 3127; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] 3128; SI-NEXT: v_bfe_u32 v10, v9, 20, 11 3129; SI-NEXT: v_add_i32_e32 v12, vcc, 0xfffffc01, v10 3130; SI-NEXT: s_mov_b32 s3, 0xfffff 3131; SI-NEXT: v_lshr_b64 v[10:11], s[2:3], v12 3132; SI-NEXT: v_not_b32_e32 v10, v10 3133; SI-NEXT: v_and_b32_e32 v10, v8, v10 3134; SI-NEXT: v_not_b32_e32 v11, v11 3135; SI-NEXT: v_and_b32_e32 v11, v9, v11 3136; SI-NEXT: v_and_b32_e32 v13, 0x80000000, v9 3137; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v12 3138; SI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc 3139; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v12 3140; SI-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[0:1] 3141; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc 3142; SI-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[0:1] 3143; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] 3144; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] 3145; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] 3146; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 3147; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 3148; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 3149; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 3150; SI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1] 3151; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] 3152; SI-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11] 3153; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 3154; SI-NEXT: 
v_cmp_eq_u32_e64 s[0:1], v1, v11 3155; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc 3156; SI-NEXT: s_nop 1 3157; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13] 3158; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] 3159; SI-NEXT: v_bfe_u32 v8, v7, 20, 11 3160; SI-NEXT: v_add_i32_e32 v10, vcc, 0xfffffc01, v8 3161; SI-NEXT: v_lshr_b64 v[8:9], s[2:3], v10 3162; SI-NEXT: v_not_b32_e32 v8, v8 3163; SI-NEXT: v_and_b32_e32 v8, v6, v8 3164; SI-NEXT: v_not_b32_e32 v9, v9 3165; SI-NEXT: v_and_b32_e32 v9, v7, v9 3166; SI-NEXT: v_and_b32_e32 v11, 0x80000000, v7 3167; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v10 3168; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc 3169; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v10 3170; SI-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[0:1] 3171; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc 3172; SI-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] 3173; SI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] 3174; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 3175; SI-NEXT: s_endpgm 3176; 3177; CI-LABEL: frem_v2f64: 3178; CI: ; %bb.0: 3179; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 3180; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 3181; CI-NEXT: s_mov_b32 s3, 0xf000 3182; CI-NEXT: s_mov_b32 s2, -1 3183; CI-NEXT: s_mov_b32 s10, s2 3184; CI-NEXT: s_waitcnt lgkmcnt(0) 3185; CI-NEXT: s_mov_b32 s0, s4 3186; CI-NEXT: s_mov_b32 s1, s5 3187; CI-NEXT: s_mov_b32 s4, s6 3188; CI-NEXT: s_mov_b32 s5, s7 3189; CI-NEXT: s_mov_b32 s6, s2 3190; CI-NEXT: s_mov_b32 s7, s3 3191; CI-NEXT: s_mov_b32 s11, s3 3192; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 3193; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 3194; CI-NEXT: s_waitcnt vmcnt(0) 3195; CI-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3] 3196; CI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] 3197; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 3198; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 3199; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 3200; CI-NEXT: v_fma_f64 
v[10:11], v[10:11], v[12:13], v[10:11] 3201; CI-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] 3202; CI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] 3203; CI-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] 3204; CI-NEXT: s_nop 1 3205; CI-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] 3206; CI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] 3207; CI-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] 3208; CI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] 3209; CI-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1] 3210; CI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] 3211; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 3212; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 3213; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 3214; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 3215; CI-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1] 3216; CI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] 3217; CI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] 3218; CI-NEXT: s_nop 1 3219; CI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] 3220; CI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] 3221; CI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 3222; CI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] 3223; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3224; CI-NEXT: s_endpgm 3225; 3226; VI-LABEL: frem_v2f64: 3227; VI: ; %bb.0: 3228; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 3229; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 3230; VI-NEXT: s_waitcnt lgkmcnt(0) 3231; VI-NEXT: v_mov_b32_e32 v0, s6 3232; VI-NEXT: s_add_u32 s0, s0, 64 3233; VI-NEXT: s_addc_u32 s1, s1, 0 3234; VI-NEXT: v_mov_b32_e32 v5, s1 3235; VI-NEXT: v_mov_b32_e32 v1, s7 3236; VI-NEXT: v_mov_b32_e32 v4, s0 3237; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 3238; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 3239; VI-NEXT: v_mov_b32_e32 v8, s4 3240; VI-NEXT: v_mov_b32_e32 v9, s5 3241; VI-NEXT: s_waitcnt vmcnt(0) 3242; VI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[6:7], 
v[6:7], v[2:3] 3243; VI-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] 3244; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 3245; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] 3246; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 3247; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] 3248; VI-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] 3249; VI-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] 3250; VI-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] 3251; VI-NEXT: s_nop 1 3252; VI-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] 3253; VI-NEXT: v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3] 3254; VI-NEXT: v_trunc_f64_e32 v[10:11], v[10:11] 3255; VI-NEXT: v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3] 3256; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] 3257; VI-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] 3258; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0 3259; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 3260; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0 3261; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 3262; VI-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] 3263; VI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] 3264; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13] 3265; VI-NEXT: s_nop 1 3266; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15] 3267; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] 3268; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 3269; VI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] 3270; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 3271; VI-NEXT: s_endpgm 3272; 3273; GFX9-LABEL: frem_v2f64: 3274; GFX9: ; %bb.0: 3275; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 3276; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 3277; GFX9-NEXT: v_mov_b32_e32 v16, 0 3278; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3279; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] 3280; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 
3281; GFX9-NEXT: s_waitcnt vmcnt(0) 3282; GFX9-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] 3283; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] 3284; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 3285; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 3286; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 3287; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 3288; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] 3289; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] 3290; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] 3291; GFX9-NEXT: s_nop 1 3292; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] 3293; GFX9-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] 3294; GFX9-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] 3295; GFX9-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] 3296; GFX9-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] 3297; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] 3298; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 3299; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 3300; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 3301; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 3302; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1] 3303; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] 3304; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] 3305; GFX9-NEXT: s_nop 1 3306; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] 3307; GFX9-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] 3308; GFX9-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 3309; GFX9-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] 3310; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] 3311; GFX9-NEXT: s_endpgm 3312; 3313; GFX10-LABEL: frem_v2f64: 3314; GFX10: ; %bb.0: 3315; GFX10-NEXT: s_clause 0x1 3316; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 3317; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 3318; GFX10-NEXT: v_mov_b32_e32 v16, 0 3319; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) 3320; GFX10-NEXT: s_clause 0x1 3321; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] 3322; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 3323; GFX10-NEXT: s_waitcnt vmcnt(0) 3324; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3] 3325; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] 3326; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 3327; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 3328; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 3329; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 3330; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3] 3331; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] 3332; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] 3333; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] 3334; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] 3335; GFX10-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] 3336; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] 3337; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1] 3338; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] 3339; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 3340; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 3341; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 3342; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 3343; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1] 3344; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] 3345; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] 3346; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] 3347; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] 3348; GFX10-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 3349; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] 3350; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] 3351; GFX10-NEXT: s_endpgm 3352; 3353; GFX11-LABEL: frem_v2f64: 3354; GFX11: ; %bb.0: 3355; GFX11-NEXT: 
s_clause 0x1 3356; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 3357; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 3358; GFX11-NEXT: v_mov_b32_e32 v16, 0 3359; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3360; GFX11-NEXT: s_clause 0x1 3361; GFX11-NEXT: global_load_b128 v[0:3], v16, s[6:7] 3362; GFX11-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64 3363; GFX11-NEXT: s_waitcnt vmcnt(0) 3364; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3] 3365; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 3366; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] 3367; GFX11-NEXT: s_waitcnt_depctr 0xfff 3368; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 3369; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 3370; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3371; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 3372; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 3373; GFX11-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3] 3374; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3375; GFX11-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] 3376; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] 3377; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3378; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] 3379; GFX11-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] 3380; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3381; GFX11-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] 3382; GFX11-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] 3383; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1] 3384; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 3385; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] 3386; GFX11-NEXT: s_waitcnt_depctr 0xfff 3387; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 3388; 
GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 3389; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3390; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 3391; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 3392; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1] 3393; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3394; GFX11-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] 3395; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] 3396; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3397; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] 3398; GFX11-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] 3399; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3400; GFX11-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 3401; GFX11-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] 3402; GFX11-NEXT: global_store_b128 v16, v[0:3], s[4:5] 3403; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3404; GFX11-NEXT: s_endpgm 3405 <2 x double> addrspace(1)* %in2) #0 { 3406 %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4 3407 %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16 3408 %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16 3409 %r2 = frem <2 x double> %r0, %r1 3410 store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16 3411 ret void 3412} 3413 3414attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 3415attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 3416