; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s

; Exercises lowering of the frem instruction inside amdgpu_kernel functions for
; half, float and double operands. Each invocation above pipes the llc output
; for one subtarget configuration into FileCheck under its own check prefix.
; The assertion comments inside every function are autogenerated (see the first
; line of this file); regenerate them with that script rather than editing them
; by hand.

; Plain frem on half (no fast-math flags): loads element 0 of %in1 and element 4
; of %in2, and stores the remainder to %out. For every prefix the expected code
; contains a div_fixup step before the trunc/fma pair that forms the remainder.
define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
; SI-LABEL: frem_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
; SI-NEXT: v_rcp_f32_e32 v4, v3
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
; SI-NEXT: v_fma_f32 v4, v5, v4, v4
; SI-NEXT: v_mul_f32_e32 v5, v2, v4
; SI-NEXT: v_fma_f32 v6, -v3, v5, v2
; SI-NEXT: v_fma_f32 v5, v6, v4, v5
; SI-NEXT: v_fma_f32 v2, -v3, v5, v2
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
; SI-NEXT: v_trunc_f32_e32 v2, v2
; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: frem_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT: s_mov_b32 s11, 0xf000
; CI-NEXT: s_mov_b32 s10, -1
; CI-NEXT: s_mov_b32 s2, s10
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b32 s8, s4
; CI-NEXT: s_mov_b32 s9, s5
; CI-NEXT: s_mov_b32 s4, s6
; CI-NEXT: s_mov_b32 s5, s7
; CI-NEXT: s_mov_b32 s6, s10
; CI-NEXT: s_mov_b32 s7, s11
; CI-NEXT: s_mov_b32 s3, s11
; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
; CI-NEXT: v_rcp_f32_e32 v4, v3
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
; CI-NEXT: v_fma_f32 v4, v5, v4, v4
; CI-NEXT: v_mul_f32_e32 v5, v2, v4
; CI-NEXT: v_fma_f32 v6, -v3, v5, v2
; CI-NEXT: v_fma_f32 v5, v6, v4, v5
; CI-NEXT: v_fma_f32 v2, -v3, v5, v2
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
; CI-NEXT: v_trunc_f32_e32 v2, v2
; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[8:11], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_add_u32 s0, s0, 8
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: flat_load_ushort v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_f16_e32 v3, v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v5, v2
; VI-NEXT: v_rcp_f32_e32 v5, v5
; VI-NEXT: v_mul_f32_e32 v3, v3, v5
; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4
; VI-NEXT: v_trunc_f16_e32 v3, v3
; VI-NEXT: v_fma_f16 v2, -v3, v2, v4
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: frem_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX9-NEXT: v_rcp_f32_e32 v4, v4
; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4
; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX9-NEXT: v_trunc_f16_e32 v3, v3
; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: frem_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX10-NEXT: v_rcp_f32_e32 v4, v4
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
                      half addrspace(1)* %in2) #0 {
  %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
  %r0 = load half, half addrspace(1)* %in1, align 4
  %r1 = load half, half addrspace(1)* %gep2, align 4
  %r2 = frem half %r0, %r1
  store half %r2, half addrspace(1)* %out, align 4
  ret void
}

; Same loads and store as @frem_f16, but the frem carries the `fast` flag. The
; expected code is the short rcp, mul, trunc, fma sequence and contains no
; div_scale, div_fmas or div_fixup instructions.
define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
; SI-LABEL: fast_frem_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_mul_f32_e32 v2, v0, v2
; SI-NEXT: v_trunc_f32_e32 v2, v2
; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: fast_frem_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT: s_mov_b32 s11, 0xf000
; CI-NEXT: s_mov_b32 s10, -1
; CI-NEXT: s_mov_b32 s2, s10
; CI-NEXT: s_mov_b32 s3, s11
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
; CI-NEXT: s_mov_b32 s8, s4
; CI-NEXT: s_mov_b32 s9, s5
; CI-NEXT: s_mov_b32 s4, s6
; CI-NEXT: s_mov_b32 s5, s7
; CI-NEXT: s_mov_b32 s6, s10
; CI-NEXT: s_mov_b32 s7, s11
; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_rcp_f32_e32 v2, v1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_mul_f32_e32 v2, v0, v2
; CI-NEXT: v_trunc_f32_e32 v2, v2
; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[8:11], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: fast_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_add_u32 s0, s0, 8
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: flat_load_ushort v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_rcp_f16_e32 v3, v2
; VI-NEXT: v_mul_f16_e32 v3, v4, v3
; VI-NEXT: v_trunc_f16_e32 v3, v3
; VI-NEXT: v_fma_f16 v2, -v3, v2, v4
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fast_frem_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v3, v2
; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3
; GFX9-NEXT: v_trunc_f16_e32 v3, v3
; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fast_frem_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v3, v2
; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
                      half addrspace(1)* %in2) #0 {
  %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
  %r0 = load half, half addrspace(1)* %in1, align 4
  %r1 = load half, half addrspace(1)* %gep2, align 4
  %r2 = frem fast half %r0, %r1
  store half %r2, half addrspace(1)* %out, align 4
  ret void
}

; Variant of @fast_frem_f16 that uses the `afn` flag and attribute group #1 in
; place of `fast` and #0 (both attribute groups are defined outside this
; excerpt). The expected instructions match those of the `fast` variant.
define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
; SI-LABEL: unsafe_frem_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_mul_f32_e32 v2, v0, v2
; SI-NEXT: v_trunc_f32_e32 v2, v2
; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: unsafe_frem_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT: s_mov_b32 s11, 0xf000
; CI-NEXT: s_mov_b32 s10, -1
; CI-NEXT: s_mov_b32 s2, s10
; CI-NEXT: s_mov_b32 s3, s11
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
; CI-NEXT: s_mov_b32 s8, s4
; CI-NEXT: s_mov_b32 s9, s5
; CI-NEXT: s_mov_b32 s4, s6
; CI-NEXT: s_mov_b32 s5, s7
; CI-NEXT: s_mov_b32 s6, s10
; CI-NEXT: s_mov_b32 s7, s11
; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_rcp_f32_e32 v2, v1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_mul_f32_e32 v2, v0, v2
; CI-NEXT: v_trunc_f32_e32 v2, v2
; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[8:11], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: unsafe_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_add_u32 s0, s0, 8
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: flat_load_ushort v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_rcp_f16_e32 v3, v2
; VI-NEXT: v_mul_f16_e32 v3, v4, v3
; VI-NEXT: v_trunc_f16_e32 v3, v3
; VI-NEXT: v_fma_f16 v2, -v3, v2, v4
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: unsafe_frem_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v3, v2
; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3
; GFX9-NEXT: v_trunc_f16_e32 v3, v3
; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: unsafe_frem_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v3, v2
; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
                      half addrspace(1)* %in2) #1 {
  %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
  %r0 = load half, half addrspace(1)* %in1, align 4
  %r1 = load half, half addrspace(1)* %gep2, align 4
  %r2 = frem afn half %r0, %r1
  store half %r2, half addrspace(1)* %out, align 4
  ret void
}

; Plain frem on float (no fast-math flags): loads element 0 of %in1 and element
; 4 of %in2, and stores the remainder to %out. Every prefix expects the long
; div_scale / rcp / fma chain / div_fmas / div_fixup sequence followed by trunc
; and a final fma.
define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
; SI-LABEL: frem_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
; SI-NEXT: v_rcp_f32_e32 v4, v3
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
; SI-NEXT: v_fma_f32 v4, v5, v4, v4
; SI-NEXT: v_mul_f32_e32 v5, v2, v4
; SI-NEXT: v_fma_f32 v6, -v3, v5, v2
; SI-NEXT: v_fma_f32 v5, v6, v4, v5
; SI-NEXT: v_fma_f32 v2, -v3, v5, v2
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
; SI-NEXT: v_trunc_f32_e32 v2, v2
; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: frem_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT: s_mov_b32 s11, 0xf000
; CI-NEXT: s_mov_b32 s10, -1
; CI-NEXT: s_mov_b32 s2, s10
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b32 s8, s4
; CI-NEXT: s_mov_b32 s9, s5
; CI-NEXT: s_mov_b32 s4, s6
; CI-NEXT: s_mov_b32 s5, s7
; CI-NEXT: s_mov_b32 s6, s10
; CI-NEXT: s_mov_b32 s7, s11
; CI-NEXT: s_mov_b32 s3, s11
; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
; CI-NEXT: v_rcp_f32_e32 v4, v3
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
; CI-NEXT: v_fma_f32 v4, v5, v4, v4
; CI-NEXT: v_mul_f32_e32 v5, v2, v4
; CI-NEXT: v_fma_f32 v6, -v3, v5, v2
; CI-NEXT: v_fma_f32 v5, v6, v4, v5
; CI-NEXT: v_fma_f32 v2, -v3, v5, v2
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
; CI-NEXT: v_trunc_f32_e32 v2, v2
; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v4
; VI-NEXT: v_div_scale_f32 v3, vcc, v4, v2, v4
; VI-NEXT: v_rcp_f32_e32 v6, v5
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
; VI-NEXT: v_fma_f32 v6, v7, v6, v6
; VI-NEXT: v_mul_f32_e32 v7, v3, v6
; VI-NEXT: v_fma_f32 v8, -v5, v7, v3
; VI-NEXT: v_fma_f32 v7, v8, v6, v7
; VI-NEXT: v_fma_f32 v3, -v5, v7, v3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v3, v3, v6, v7
; VI-NEXT: v_div_fixup_f32 v3, v3, v2, v4
; VI-NEXT: v_trunc_f32_e32 v3, v3
; VI-NEXT: v_fma_f32 v2, -v3, v2, v4
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: frem_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, v1
; GFX9-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1
; GFX9-NEXT: v_rcp_f32_e32 v5, v4
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX9-NEXT: v_fma_f32 v6, -v4, v5, 1.0
; GFX9-NEXT: v_fma_f32 v5, v6, v5, v5
; GFX9-NEXT: v_mul_f32_e32 v6, v3, v5
; GFX9-NEXT: v_fma_f32 v7, -v4, v6, v3
; GFX9-NEXT: v_fma_f32 v6, v7, v5, v6
; GFX9-NEXT: v_fma_f32 v3, -v4, v6, v3
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX9-NEXT: v_div_fmas_f32 v3, v3, v5, v6
; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v1
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: frem_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v4, s0, v2, v2, v1
; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
; GFX10-NEXT: v_rcp_f32_e32 v5, v4
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v6, -v4, v5, 1.0
; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v5
; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5
; GFX10-NEXT: v_fma_f32 v7, -v4, v6, v3
; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v5
; GFX10-NEXT: v_fma_f32 v3, -v4, v6, v3
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v6
; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v1
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
                      float addrspace(1)* %in2) #0 {
  %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
  %r0 = load float, float addrspace(1)* %in1, align 4
  %r1 = load float, float addrspace(1)* %gep2, align 4
  %r2 = frem float %r0, %r1
  store float %r2, float addrspace(1)* %out, align 4
  ret void
}

; Same loads and store as @frem_f32, but the frem carries the `fast` flag. The
; expected code shrinks to rcp, mul, trunc, fma on every prefix.
define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
; SI-LABEL: fast_frem_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_mul_f32_e32 v2, v0, v2
; SI-NEXT: v_trunc_f32_e32 v2, v2
; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: fast_frem_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT: s_mov_b32 s11, 0xf000
; CI-NEXT: s_mov_b32 s10, -1
; CI-NEXT: s_mov_b32 s2, s10
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b32 s8, s4
; CI-NEXT: s_mov_b32 s9, s5
; CI-NEXT: s_mov_b32 s4, s6
; CI-NEXT: s_mov_b32 s5, s7
; CI-NEXT: s_mov_b32 s6, s10
; CI-NEXT: s_mov_b32 s7, s11
; CI-NEXT: s_mov_b32 s3, s11
; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_rcp_f32_e32 v2, v1
; CI-NEXT: v_mul_f32_e32 v2, v0, v2
; CI-NEXT: v_trunc_f32_e32 v2, v2
; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: fast_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_rcp_f32_e32 v3, v2
; VI-NEXT: v_mul_f32_e32 v3, v4, v3
; VI-NEXT: v_trunc_f32_e32 v3, v3
; VI-NEXT: v_fma_f32 v2, -v3, v2, v4
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fast_frem_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f32_e32 v3, v2
; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fast_frem_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
                      float addrspace(1)* %in2) #0 {
  %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
  %r0 = load float, float addrspace(1)* %in1, align 4
  %r1 = load float, float addrspace(1)* %gep2, align 4
  %r2 = frem fast float %r0, %r1
  store float %r2, float addrspace(1)* %out, align 4
  ret void
}

; Variant of @fast_frem_f32 that uses the `afn` flag and attribute group #1 in
; place of `fast` and #0 (both attribute groups are defined outside this
; excerpt). The expected instructions match those of the `fast` variant.
define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
; SI-LABEL: unsafe_frem_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_rcp_f32_e32 v2, v1
; SI-NEXT: v_mul_f32_e32 v2, v0, v2
; SI-NEXT: v_trunc_f32_e32 v2, v2
; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: unsafe_frem_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT: s_mov_b32 s11, 0xf000
; CI-NEXT: s_mov_b32 s10, -1
; CI-NEXT: s_mov_b32 s2, s10
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b32 s8, s4
; CI-NEXT: s_mov_b32 s9, s5
; CI-NEXT: s_mov_b32 s4, s6
; CI-NEXT: s_mov_b32 s5, s7
; CI-NEXT: s_mov_b32 s6, s10
; CI-NEXT: s_mov_b32 s7, s11
; CI-NEXT: s_mov_b32 s3, s11
; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_rcp_f32_e32 v2, v1
; CI-NEXT: v_mul_f32_e32 v2, v0, v2
; CI-NEXT: v_trunc_f32_e32 v2, v2
; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: unsafe_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_rcp_f32_e32 v3, v2
; VI-NEXT: v_mul_f32_e32 v3, v4, v3
; VI-NEXT: v_trunc_f32_e32 v3, v3
; VI-NEXT: v_fma_f32 v2, -v3, v2, v4
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: unsafe_frem_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f32_e32 v3, v2
; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: unsafe_frem_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
                      float addrspace(1)* %in2) #1 {
  %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
  %r0 = load float, float addrspace(1)* %in1, align 4
  %r1 = load float, float addrspace(1)* %gep2, align 4
  %r2 = frem afn float %r0, %r1
  store float %r2, float addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; SI-LABEL: frem_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s8
; SI-NEXT: s_mov_b32 s5, s9
; SI-NEXT:
s_mov_b32 s8, s10 833; SI-NEXT: s_mov_b32 s9, s11 834; SI-NEXT: s_mov_b32 s10, s6 835; SI-NEXT: s_mov_b32 s11, s7 836; SI-NEXT: s_mov_b32 s2, s6 837; SI-NEXT: s_mov_b32 s3, s7 838; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 839; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 840; SI-NEXT: s_waitcnt vmcnt(0) 841; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] 842; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 843; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 844; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 845; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 846; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 847; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1] 848; SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 849; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9] 850; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 851; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v9 852; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc 853; SI-NEXT: s_nop 1 854; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11] 855; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 856; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 857; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 858; SI-NEXT: s_mov_b32 s1, 0xfffff 859; SI-NEXT: s_mov_b32 s0, s6 860; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 861; SI-NEXT: v_not_b32_e32 v6, v6 862; SI-NEXT: v_and_b32_e32 v6, v4, v6 863; SI-NEXT: v_not_b32_e32 v7, v7 864; SI-NEXT: v_and_b32_e32 v7, v5, v7 865; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5 866; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8 867; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc 868; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8 869; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] 870; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 871; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] 872; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 873; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 874; SI-NEXT: s_endpgm 875; 876; CI-LABEL: frem_f64: 877; CI: ; %bb.0: 878; CI-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 879; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 880; CI-NEXT: s_mov_b32 s11, 0xf000 881; CI-NEXT: s_mov_b32 s10, -1 882; CI-NEXT: s_mov_b32 s2, s10 883; CI-NEXT: s_waitcnt lgkmcnt(0) 884; CI-NEXT: s_mov_b32 s8, s4 885; CI-NEXT: s_mov_b32 s9, s5 886; CI-NEXT: s_mov_b32 s4, s6 887; CI-NEXT: s_mov_b32 s5, s7 888; CI-NEXT: s_mov_b32 s6, s10 889; CI-NEXT: s_mov_b32 s7, s11 890; CI-NEXT: s_mov_b32 s3, s11 891; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 892; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 893; CI-NEXT: s_waitcnt vmcnt(0) 894; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] 895; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 896; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 897; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 898; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 899; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 900; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] 901; CI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 902; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 903; CI-NEXT: s_nop 1 904; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 905; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 906; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 907; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 908; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 909; CI-NEXT: s_endpgm 910; 911; VI-LABEL: frem_f64: 912; VI: ; %bb.0: 913; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 914; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 915; VI-NEXT: s_waitcnt lgkmcnt(0) 916; VI-NEXT: v_mov_b32_e32 v2, s6 917; VI-NEXT: v_mov_b32_e32 v3, s7 918; VI-NEXT: v_mov_b32_e32 v4, s0 919; VI-NEXT: v_mov_b32_e32 v5, s1 920; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 921; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 922; VI-NEXT: v_mov_b32_e32 v0, s4 923; VI-NEXT: v_mov_b32_e32 v1, s5 924; VI-NEXT: s_waitcnt vmcnt(0) 925; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], 
v[2:3] 926; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] 927; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 928; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 929; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 930; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 931; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3] 932; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] 933; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] 934; VI-NEXT: s_nop 1 935; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] 936; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3] 937; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 938; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 939; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 940; VI-NEXT: s_endpgm 941; 942; GFX9-LABEL: frem_f64: 943; GFX9: ; %bb.0: 944; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 945; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 946; GFX9-NEXT: v_mov_b32_e32 v12, 0 947; GFX9-NEXT: s_waitcnt lgkmcnt(0) 948; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] 949; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] 950; GFX9-NEXT: s_waitcnt vmcnt(0) 951; GFX9-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] 952; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 953; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 954; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 955; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 956; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 957; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] 958; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 959; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 960; GFX9-NEXT: s_nop 1 961; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 962; GFX9-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 963; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 964; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 965; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5] 966; GFX9-NEXT: 
s_endpgm 967; 968; GFX10-LABEL: frem_f64: 969; GFX10: ; %bb.0: 970; GFX10-NEXT: s_clause 0x1 971; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 972; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 973; GFX10-NEXT: v_mov_b32_e32 v12, 0 974; GFX10-NEXT: s_waitcnt lgkmcnt(0) 975; GFX10-NEXT: s_clause 0x1 976; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] 977; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] 978; GFX10-NEXT: s_waitcnt vmcnt(0) 979; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1] 980; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 981; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 982; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 983; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 984; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 985; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] 986; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 987; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 988; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 989; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 990; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 991; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 992; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5] 993; GFX10-NEXT: s_endpgm 994 double addrspace(1)* %in2) #0 { 995 %r0 = load double, double addrspace(1)* %in1, align 8 996 %r1 = load double, double addrspace(1)* %in2, align 8 997 %r2 = frem double %r0, %r1 998 store double %r2, double addrspace(1)* %out, align 8 999 ret void 1000} 1001 1002define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, 1003; SI-LABEL: fast_frem_f64: 1004; SI: ; %bb.0: 1005; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 1006; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1007; SI-NEXT: s_mov_b32 s7, 0xf000 1008; SI-NEXT: s_mov_b32 s6, -1 1009; SI-NEXT: s_waitcnt lgkmcnt(0) 1010; SI-NEXT: s_mov_b32 s4, s8 1011; SI-NEXT: s_mov_b32 s5, s9 1012; SI-NEXT: 
s_mov_b32 s8, s10 1013; SI-NEXT: s_mov_b32 s9, s11 1014; SI-NEXT: s_mov_b32 s10, s6 1015; SI-NEXT: s_mov_b32 s11, s7 1016; SI-NEXT: s_mov_b32 s2, s6 1017; SI-NEXT: s_mov_b32 s3, s7 1018; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1019; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 1020; SI-NEXT: s_waitcnt vmcnt(0) 1021; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1022; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1023; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1024; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1025; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1026; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1027; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1028; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1029; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 1030; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 1031; SI-NEXT: s_mov_b32 s1, 0xfffff 1032; SI-NEXT: s_mov_b32 s0, s6 1033; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 1034; SI-NEXT: v_not_b32_e32 v6, v6 1035; SI-NEXT: v_and_b32_e32 v6, v4, v6 1036; SI-NEXT: v_not_b32_e32 v7, v7 1037; SI-NEXT: v_and_b32_e32 v7, v5, v7 1038; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5 1039; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8 1040; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc 1041; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8 1042; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] 1043; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 1044; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] 1045; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1046; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1047; SI-NEXT: s_endpgm 1048; 1049; CI-LABEL: fast_frem_f64: 1050; CI: ; %bb.0: 1051; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1052; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1053; CI-NEXT: s_mov_b32 s11, 0xf000 1054; CI-NEXT: s_mov_b32 s10, -1 1055; CI-NEXT: s_mov_b32 s2, s10 1056; CI-NEXT: s_waitcnt lgkmcnt(0) 1057; CI-NEXT: s_mov_b32 s8, s4 1058; CI-NEXT: s_mov_b32 s9, s5 1059; CI-NEXT: s_mov_b32 s4, s6 1060; 
CI-NEXT: s_mov_b32 s5, s7 1061; CI-NEXT: s_mov_b32 s6, s10 1062; CI-NEXT: s_mov_b32 s7, s11 1063; CI-NEXT: s_mov_b32 s3, s11 1064; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1065; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 1066; CI-NEXT: s_waitcnt vmcnt(0) 1067; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1068; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1069; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1070; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1071; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1072; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1073; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1074; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1075; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1076; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1077; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1078; CI-NEXT: s_endpgm 1079; 1080; VI-LABEL: fast_frem_f64: 1081; VI: ; %bb.0: 1082; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1083; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1084; VI-NEXT: s_waitcnt lgkmcnt(0) 1085; VI-NEXT: v_mov_b32_e32 v2, s6 1086; VI-NEXT: v_mov_b32_e32 v3, s7 1087; VI-NEXT: v_mov_b32_e32 v4, s0 1088; VI-NEXT: v_mov_b32_e32 v5, s1 1089; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1090; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 1091; VI-NEXT: v_mov_b32_e32 v0, s4 1092; VI-NEXT: v_mov_b32_e32 v1, s5 1093; VI-NEXT: s_waitcnt vmcnt(0) 1094; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1095; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1096; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1097; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1098; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1099; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] 1100; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] 1101; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] 1102; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 1103; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 1104; VI-NEXT: flat_store_dwordx2 
v[0:1], v[2:3] 1105; VI-NEXT: s_endpgm 1106; 1107; GFX9-LABEL: fast_frem_f64: 1108; GFX9: ; %bb.0: 1109; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1110; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1111; GFX9-NEXT: v_mov_b32_e32 v10, 0 1112; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1113; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] 1114; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] 1115; GFX9-NEXT: s_waitcnt vmcnt(0) 1116; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1117; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1118; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1119; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1120; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1121; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1122; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1123; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1124; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1125; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1126; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] 1127; GFX9-NEXT: s_endpgm 1128; 1129; GFX10-LABEL: fast_frem_f64: 1130; GFX10: ; %bb.0: 1131; GFX10-NEXT: s_clause 0x1 1132; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1133; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1134; GFX10-NEXT: v_mov_b32_e32 v10, 0 1135; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1136; GFX10-NEXT: s_clause 0x1 1137; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] 1138; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] 1139; GFX10-NEXT: s_waitcnt vmcnt(0) 1140; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1141; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1142; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1143; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1144; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1145; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1146; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1147; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1148; GFX10-NEXT: 
v_trunc_f64_e32 v[4:5], v[4:5] 1149; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1150; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] 1151; GFX10-NEXT: s_endpgm 1152 double addrspace(1)* %in2) #0 { 1153 %r0 = load double, double addrspace(1)* %in1, align 8 1154 %r1 = load double, double addrspace(1)* %in2, align 8 1155 %r2 = frem fast double %r0, %r1 1156 store double %r2, double addrspace(1)* %out, align 8 1157 ret void 1158} 1159 1160define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, 1161; SI-LABEL: unsafe_frem_f64: 1162; SI: ; %bb.0: 1163; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 1164; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1165; SI-NEXT: s_mov_b32 s7, 0xf000 1166; SI-NEXT: s_mov_b32 s6, -1 1167; SI-NEXT: s_waitcnt lgkmcnt(0) 1168; SI-NEXT: s_mov_b32 s4, s8 1169; SI-NEXT: s_mov_b32 s5, s9 1170; SI-NEXT: s_mov_b32 s8, s10 1171; SI-NEXT: s_mov_b32 s9, s11 1172; SI-NEXT: s_mov_b32 s10, s6 1173; SI-NEXT: s_mov_b32 s11, s7 1174; SI-NEXT: s_mov_b32 s2, s6 1175; SI-NEXT: s_mov_b32 s3, s7 1176; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1177; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 1178; SI-NEXT: s_waitcnt vmcnt(0) 1179; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1180; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1181; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1182; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1183; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1184; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1185; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1186; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1187; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 1188; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 1189; SI-NEXT: s_mov_b32 s1, 0xfffff 1190; SI-NEXT: s_mov_b32 s0, s6 1191; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 1192; SI-NEXT: v_not_b32_e32 v6, v6 1193; SI-NEXT: v_and_b32_e32 v6, v4, v6 1194; SI-NEXT: v_not_b32_e32 v7, v7 1195; SI-NEXT: v_and_b32_e32 v7, 
v5, v7 1196; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5 1197; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8 1198; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc 1199; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8 1200; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] 1201; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 1202; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] 1203; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1204; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1205; SI-NEXT: s_endpgm 1206; 1207; CI-LABEL: unsafe_frem_f64: 1208; CI: ; %bb.0: 1209; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1210; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1211; CI-NEXT: s_mov_b32 s11, 0xf000 1212; CI-NEXT: s_mov_b32 s10, -1 1213; CI-NEXT: s_mov_b32 s2, s10 1214; CI-NEXT: s_waitcnt lgkmcnt(0) 1215; CI-NEXT: s_mov_b32 s8, s4 1216; CI-NEXT: s_mov_b32 s9, s5 1217; CI-NEXT: s_mov_b32 s4, s6 1218; CI-NEXT: s_mov_b32 s5, s7 1219; CI-NEXT: s_mov_b32 s6, s10 1220; CI-NEXT: s_mov_b32 s7, s11 1221; CI-NEXT: s_mov_b32 s3, s11 1222; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1223; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 1224; CI-NEXT: s_waitcnt vmcnt(0) 1225; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1226; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1227; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1228; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1229; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1230; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1231; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1232; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1233; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1234; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1235; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1236; CI-NEXT: s_endpgm 1237; 1238; VI-LABEL: unsafe_frem_f64: 1239; VI: ; %bb.0: 1240; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1241; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1242; VI-NEXT: s_waitcnt lgkmcnt(0) 1243; VI-NEXT: v_mov_b32_e32 
v2, s6 1244; VI-NEXT: v_mov_b32_e32 v3, s7 1245; VI-NEXT: v_mov_b32_e32 v4, s0 1246; VI-NEXT: v_mov_b32_e32 v5, s1 1247; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1248; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 1249; VI-NEXT: v_mov_b32_e32 v0, s4 1250; VI-NEXT: v_mov_b32_e32 v1, s5 1251; VI-NEXT: s_waitcnt vmcnt(0) 1252; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1253; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1254; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1255; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1256; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1257; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] 1258; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] 1259; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] 1260; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 1261; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 1262; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1263; VI-NEXT: s_endpgm 1264; 1265; GFX9-LABEL: unsafe_frem_f64: 1266; GFX9: ; %bb.0: 1267; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1268; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1269; GFX9-NEXT: v_mov_b32_e32 v10, 0 1270; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1271; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] 1272; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] 1273; GFX9-NEXT: s_waitcnt vmcnt(0) 1274; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1275; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1276; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1277; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1278; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1279; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1280; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1281; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1282; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1283; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1284; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] 1285; GFX9-NEXT: s_endpgm 1286; 1287; GFX10-LABEL: unsafe_frem_f64: 1288; 
GFX10: ; %bb.0: 1289; GFX10-NEXT: s_clause 0x1 1290; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1291; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1292; GFX10-NEXT: v_mov_b32_e32 v10, 0 1293; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1294; GFX10-NEXT: s_clause 0x1 1295; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] 1296; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] 1297; GFX10-NEXT: s_waitcnt vmcnt(0) 1298; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1299; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1300; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1301; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1302; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1303; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1304; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1305; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1306; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1307; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1308; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] 1309; GFX10-NEXT: s_endpgm 1310 double addrspace(1)* %in2) #1 { 1311 %r0 = load double, double addrspace(1)* %in1, align 8 1312 %r1 = load double, double addrspace(1)* %in2, align 8 1313 %r2 = frem afn double %r0, %r1 1314 store double %r2, double addrspace(1)* %out, align 8 1315 ret void 1316} 1317 1318define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, 1319; SI-LABEL: frem_v2f16: 1320; SI: ; %bb.0: 1321; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1322; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1323; SI-NEXT: s_mov_b32 s3, 0xf000 1324; SI-NEXT: s_mov_b32 s2, -1 1325; SI-NEXT: s_waitcnt lgkmcnt(0) 1326; SI-NEXT: s_mov_b32 s0, s4 1327; SI-NEXT: s_mov_b32 s1, s5 1328; SI-NEXT: s_mov_b32 s4, s6 1329; SI-NEXT: s_mov_b32 s5, s7 1330; SI-NEXT: s_mov_b32 s6, s2 1331; SI-NEXT: s_mov_b32 s7, s3 1332; SI-NEXT: s_mov_b32 s10, s2 1333; SI-NEXT: s_mov_b32 s11, s3 1334; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 
1335; SI-NEXT: s_waitcnt vmcnt(0) 1336; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 1337; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1338; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1339; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 1340; SI-NEXT: s_waitcnt vmcnt(0) 1341; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 1342; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1343; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1344; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 1345; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 1346; SI-NEXT: v_rcp_f32_e32 v6, v5 1347; SI-NEXT: s_mov_b32 s6, 3 1348; SI-NEXT: s_mov_b32 s7, 0 1349; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1350; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 1351; SI-NEXT: v_fma_f32 v6, v7, v6, v6 1352; SI-NEXT: v_mul_f32_e32 v7, v4, v6 1353; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 1354; SI-NEXT: v_fma_f32 v7, v8, v6, v7 1355; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 1356; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1357; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 1358; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 1359; SI-NEXT: v_trunc_f32_e32 v4, v4 1360; SI-NEXT: v_fma_f32 v0, -v4, v2, v0 1361; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1362; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1363; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1364; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 1365; SI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 1366; SI-NEXT: v_rcp_f32_e32 v5, v4 1367; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1368; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 1369; SI-NEXT: v_fma_f32 v5, v6, v5, v5 1370; SI-NEXT: v_mul_f32_e32 v6, v2, v5 1371; SI-NEXT: v_fma_f32 v7, -v4, v6, v2 1372; SI-NEXT: v_fma_f32 v6, v7, v5, v6 1373; SI-NEXT: v_fma_f32 v2, -v4, v6, v2 1374; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1375; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 1376; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 1377; SI-NEXT: v_trunc_f32_e32 v2, v2 1378; SI-NEXT: v_fma_f32 v1, -v2, v3, v1 1379; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1380; SI-NEXT: v_or_b32_e32 v0, 
v1, v0 1381; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1382; SI-NEXT: s_endpgm 1383; 1384; CI-LABEL: frem_v2f16: 1385; CI: ; %bb.0: 1386; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1387; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1388; CI-NEXT: s_mov_b32 s3, 0xf000 1389; CI-NEXT: s_mov_b32 s2, -1 1390; CI-NEXT: s_mov_b32 s10, s2 1391; CI-NEXT: s_waitcnt lgkmcnt(0) 1392; CI-NEXT: s_mov_b32 s0, s4 1393; CI-NEXT: s_mov_b32 s1, s5 1394; CI-NEXT: s_mov_b32 s4, s6 1395; CI-NEXT: s_mov_b32 s5, s7 1396; CI-NEXT: s_mov_b32 s6, s2 1397; CI-NEXT: s_mov_b32 s7, s3 1398; CI-NEXT: s_mov_b32 s11, s3 1399; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 1400; CI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 1401; CI-NEXT: s_mov_b32 s6, 3 1402; CI-NEXT: s_mov_b32 s7, 0 1403; CI-NEXT: s_waitcnt vmcnt(1) 1404; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 1405; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1406; CI-NEXT: s_waitcnt vmcnt(0) 1407; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 1408; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1409; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1410; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 1411; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 1412; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 1413; CI-NEXT: v_rcp_f32_e32 v6, v5 1414; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1415; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 1416; CI-NEXT: v_fma_f32 v6, v7, v6, v6 1417; CI-NEXT: v_mul_f32_e32 v7, v4, v6 1418; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 1419; CI-NEXT: v_fma_f32 v7, v8, v6, v7 1420; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 1421; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1422; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 1423; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 1424; CI-NEXT: v_trunc_f32_e32 v4, v4 1425; CI-NEXT: v_fma_f32 v0, -v4, v2, v0 1426; CI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 1427; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 1428; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1429; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 1430; CI-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 1431; CI-NEXT: v_rcp_f32_e32 v5, v4 1432; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1433; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 1434; CI-NEXT: v_fma_f32 v5, v6, v5, v5 1435; CI-NEXT: v_mul_f32_e32 v6, v2, v5 1436; CI-NEXT: v_fma_f32 v7, -v4, v6, v2 1437; CI-NEXT: v_fma_f32 v6, v7, v5, v6 1438; CI-NEXT: v_fma_f32 v2, -v4, v6, v2 1439; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1440; CI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 1441; CI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 1442; CI-NEXT: v_trunc_f32_e32 v2, v2 1443; CI-NEXT: v_fma_f32 v1, -v2, v3, v1 1444; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 1445; CI-NEXT: v_or_b32_e32 v0, v1, v0 1446; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1447; CI-NEXT: s_endpgm 1448; 1449; VI-LABEL: frem_v2f16: 1450; VI: ; %bb.0: 1451; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1452; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1453; VI-NEXT: s_waitcnt lgkmcnt(0) 1454; VI-NEXT: v_mov_b32_e32 v2, s6 1455; VI-NEXT: s_add_u32 s0, s0, 16 1456; VI-NEXT: v_mov_b32_e32 v3, s7 1457; VI-NEXT: s_addc_u32 s1, s1, 0 1458; VI-NEXT: flat_load_dword v4, v[2:3] 1459; VI-NEXT: v_mov_b32_e32 v3, s1 1460; VI-NEXT: v_mov_b32_e32 v2, s0 1461; VI-NEXT: flat_load_dword v2, v[2:3] 1462; VI-NEXT: v_mov_b32_e32 v0, s4 1463; VI-NEXT: v_mov_b32_e32 v1, s5 1464; VI-NEXT: s_waitcnt vmcnt(1) 1465; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 1466; VI-NEXT: v_cvt_f32_f16_e32 v5, v3 1467; VI-NEXT: s_waitcnt vmcnt(0) 1468; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 1469; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 1470; VI-NEXT: v_rcp_f32_e32 v7, v7 1471; VI-NEXT: v_mul_f32_e32 v5, v5, v7 1472; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 1473; VI-NEXT: v_div_fixup_f16 v5, v5, v6, v3 1474; VI-NEXT: v_trunc_f16_e32 v5, v5 1475; VI-NEXT: v_fma_f16 v3, -v5, v6, v3 1476; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 1477; VI-NEXT: v_cvt_f32_f16_e32 v5, v4 1478; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1479; VI-NEXT: v_rcp_f32_e32 v6, v6 1480; VI-NEXT: v_mul_f32_e32 v5, v5, v6 1481; 
VI-NEXT: v_cvt_f16_f32_e32 v5, v5 1482; VI-NEXT: v_div_fixup_f16 v5, v5, v2, v4 1483; VI-NEXT: v_trunc_f16_e32 v5, v5 1484; VI-NEXT: v_fma_f16 v2, -v5, v2, v4 1485; VI-NEXT: v_or_b32_e32 v2, v2, v3 1486; VI-NEXT: flat_store_dword v[0:1], v2 1487; VI-NEXT: s_endpgm 1488; 1489; GFX9-LABEL: frem_v2f16: 1490; GFX9: ; %bb.0: 1491; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1492; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1493; GFX9-NEXT: v_mov_b32_e32 v0, 0 1494; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1495; GFX9-NEXT: global_load_dword v1, v0, s[6:7] 1496; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 1497; GFX9-NEXT: s_waitcnt vmcnt(1) 1498; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 1499; GFX9-NEXT: s_waitcnt vmcnt(0) 1500; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 1501; GFX9-NEXT: v_rcp_f32_e32 v4, v4 1502; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 1503; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 1504; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 1505; GFX9-NEXT: v_trunc_f16_e32 v3, v3 1506; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v1 1507; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1508; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 1509; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1510; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1 1511; GFX9-NEXT: v_rcp_f32_e32 v5, v5 1512; GFX9-NEXT: v_mul_f32_e32 v4, v4, v5 1513; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4 1514; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1 1515; GFX9-NEXT: v_trunc_f16_e32 v4, v4 1516; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1 1517; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 1518; GFX9-NEXT: global_store_dword v0, v1, s[4:5] 1519; GFX9-NEXT: s_endpgm 1520; 1521; GFX10-LABEL: frem_v2f16: 1522; GFX10: ; %bb.0: 1523; GFX10-NEXT: s_clause 0x1 1524; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1525; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1526; GFX10-NEXT: v_mov_b32_e32 v0, 0 1527; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1528; GFX10-NEXT: s_clause 0x1 1529; GFX10-NEXT: global_load_dword v1, v0, s[6:7] 1530; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 1531; 
GFX10-NEXT: s_waitcnt vmcnt(1) 1532; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 1533; GFX10-NEXT: s_waitcnt vmcnt(0) 1534; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 1535; GFX10-NEXT: v_rcp_f32_e32 v4, v4 1536; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 1537; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 1538; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 1539; GFX10-NEXT: v_trunc_f16_e32 v3, v3 1540; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1 1541; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1542; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1543; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 1544; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 1545; GFX10-NEXT: v_rcp_f32_e32 v5, v5 1546; GFX10-NEXT: v_mul_f32_e32 v4, v4, v5 1547; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 1548; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1 1549; GFX10-NEXT: v_trunc_f16_e32 v4, v4 1550; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1 1551; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 1552; GFX10-NEXT: global_store_dword v0, v1, s[4:5] 1553; GFX10-NEXT: s_endpgm 1554 <2 x half> addrspace(1)* %in2) #0 { 1555 %gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4 1556 %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8 1557 %r1 = load <2 x half>, <2 x half> addrspace(1)* %gep2, align 8 1558 %r2 = frem <2 x half> %r0, %r1 1559 store <2 x half> %r2, <2 x half> addrspace(1)* %out, align 8 1560 ret void 1561} 1562 1563define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in1, 1564; SI-LABEL: frem_v4f16: 1565; SI: ; %bb.0: 1566; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1567; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1568; SI-NEXT: s_mov_b32 s3, 0xf000 1569; SI-NEXT: s_mov_b32 s2, -1 1570; SI-NEXT: s_waitcnt lgkmcnt(0) 1571; SI-NEXT: s_mov_b32 s0, s4 1572; SI-NEXT: s_mov_b32 s1, s5 1573; SI-NEXT: s_mov_b32 s4, s6 1574; SI-NEXT: s_mov_b32 s5, s7 1575; SI-NEXT: s_mov_b32 s6, s2 1576; SI-NEXT: s_mov_b32 s7, s3 1577; SI-NEXT: s_mov_b32 s10, s2 1578; SI-NEXT: s_mov_b32 s11, s3 1579; SI-NEXT: 
buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1580; SI-NEXT: s_waitcnt vmcnt(0) 1581; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 1582; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1583; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 1584; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 1585; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 1586; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 1587; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32 1588; SI-NEXT: s_waitcnt vmcnt(0) 1589; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 1590; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1591; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1592; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 1593; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1594; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1595; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 1596; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 1597; SI-NEXT: v_rcp_f32_e32 v10, v9 1598; SI-NEXT: s_mov_b32 s6, 3 1599; SI-NEXT: s_mov_b32 s7, 0 1600; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1601; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 1602; SI-NEXT: v_fma_f32 v10, v11, v10, v10 1603; SI-NEXT: v_mul_f32_e32 v11, v8, v10 1604; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 1605; SI-NEXT: v_fma_f32 v11, v12, v10, v11 1606; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 1607; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1608; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 1609; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 1610; SI-NEXT: v_trunc_f32_e32 v8, v8 1611; SI-NEXT: v_fma_f32 v1, -v8, v1, v5 1612; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1613; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1614; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1615; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 1616; SI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 1617; SI-NEXT: v_rcp_f32_e32 v9, v8 1618; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1619; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 1620; SI-NEXT: v_fma_f32 v9, v10, v9, v9 1621; SI-NEXT: v_mul_f32_e32 v10, v5, v9 1622; SI-NEXT: v_fma_f32 v11, -v8, v10, v5 1623; SI-NEXT: v_fma_f32 v10, v11, v9, v10 1624; SI-NEXT: 
v_fma_f32 v5, -v8, v10, v5 1625; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1626; SI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 1627; SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 1628; SI-NEXT: v_trunc_f32_e32 v5, v5 1629; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 1630; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 1631; SI-NEXT: v_or_b32_e32 v1, v4, v1 1632; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 1633; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 1634; SI-NEXT: v_rcp_f32_e32 v7, v5 1635; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1636; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 1637; SI-NEXT: v_fma_f32 v7, v8, v7, v7 1638; SI-NEXT: v_mul_f32_e32 v8, v4, v7 1639; SI-NEXT: v_fma_f32 v9, -v5, v8, v4 1640; SI-NEXT: v_fma_f32 v8, v9, v7, v8 1641; SI-NEXT: v_fma_f32 v4, -v5, v8, v4 1642; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1643; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 1644; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 1645; SI-NEXT: v_trunc_f32_e32 v4, v4 1646; SI-NEXT: v_fma_f32 v0, -v4, v0, v3 1647; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1648; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1649; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 1650; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 1651; SI-NEXT: v_rcp_f32_e32 v5, v4 1652; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1653; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 1654; SI-NEXT: v_fma_f32 v5, v7, v5, v5 1655; SI-NEXT: v_mul_f32_e32 v7, v3, v5 1656; SI-NEXT: v_fma_f32 v8, -v4, v7, v3 1657; SI-NEXT: v_fma_f32 v7, v8, v5, v7 1658; SI-NEXT: v_fma_f32 v3, -v4, v7, v3 1659; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1660; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 1661; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 1662; SI-NEXT: v_trunc_f32_e32 v3, v3 1663; SI-NEXT: v_fma_f32 v2, -v3, v6, v2 1664; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1665; SI-NEXT: v_or_b32_e32 v0, v2, v0 1666; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1667; SI-NEXT: s_endpgm 1668; 1669; CI-LABEL: frem_v4f16: 1670; CI: ; %bb.0: 1671; CI-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 1672; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1673; CI-NEXT: s_mov_b32 s3, 0xf000 1674; CI-NEXT: s_mov_b32 s2, -1 1675; CI-NEXT: s_mov_b32 s10, s2 1676; CI-NEXT: s_waitcnt lgkmcnt(0) 1677; CI-NEXT: s_mov_b32 s0, s4 1678; CI-NEXT: s_mov_b32 s1, s5 1679; CI-NEXT: s_mov_b32 s4, s6 1680; CI-NEXT: s_mov_b32 s5, s7 1681; CI-NEXT: s_mov_b32 s6, s2 1682; CI-NEXT: s_mov_b32 s7, s3 1683; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1684; CI-NEXT: s_mov_b32 s11, s3 1685; CI-NEXT: s_mov_b32 s6, 3 1686; CI-NEXT: s_mov_b32 s7, 0 1687; CI-NEXT: s_waitcnt vmcnt(0) 1688; CI-NEXT: v_cvt_f32_f16_e32 v2, v0 1689; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1690; CI-NEXT: v_cvt_f32_f16_e32 v3, v0 1691; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 1692; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 1693; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 1694; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32 1695; CI-NEXT: s_waitcnt vmcnt(0) 1696; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 1697; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1698; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 1699; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 1700; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1701; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1702; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 1703; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 1704; CI-NEXT: v_rcp_f32_e32 v10, v9 1705; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1706; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 1707; CI-NEXT: v_fma_f32 v10, v11, v10, v10 1708; CI-NEXT: v_mul_f32_e32 v11, v8, v10 1709; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 1710; CI-NEXT: v_fma_f32 v11, v12, v10, v11 1711; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 1712; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1713; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 1714; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 1715; CI-NEXT: v_trunc_f32_e32 v8, v8 1716; CI-NEXT: v_fma_f32 v1, -v8, v1, v5 1717; CI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 1718; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 1719; 
CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1720; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 1721; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1722; CI-NEXT: v_rcp_f32_e32 v9, v8 1723; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1724; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 1725; CI-NEXT: v_fma_f32 v9, v10, v9, v9 1726; CI-NEXT: v_mul_f32_e32 v10, v5, v9 1727; CI-NEXT: v_fma_f32 v11, -v8, v10, v5 1728; CI-NEXT: v_fma_f32 v10, v11, v9, v10 1729; CI-NEXT: v_fma_f32 v5, -v8, v10, v5 1730; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1731; CI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 1732; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 1733; CI-NEXT: v_trunc_f32_e32 v5, v5 1734; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 1735; CI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 1736; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 1737; CI-NEXT: v_or_b32_e32 v1, v4, v1 1738; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 1739; CI-NEXT: v_rcp_f32_e32 v7, v5 1740; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1741; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 1742; CI-NEXT: v_fma_f32 v7, v8, v7, v7 1743; CI-NEXT: v_mul_f32_e32 v8, v4, v7 1744; CI-NEXT: v_fma_f32 v9, -v5, v8, v4 1745; CI-NEXT: v_fma_f32 v8, v9, v7, v8 1746; CI-NEXT: v_fma_f32 v4, -v5, v8, v4 1747; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1748; CI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 1749; CI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 1750; CI-NEXT: v_trunc_f32_e32 v4, v4 1751; CI-NEXT: v_fma_f32 v0, -v4, v0, v3 1752; CI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 1753; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 1754; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 1755; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1756; CI-NEXT: v_rcp_f32_e32 v5, v4 1757; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1758; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 1759; CI-NEXT: v_fma_f32 v5, v7, v5, v5 1760; CI-NEXT: v_mul_f32_e32 v7, v3, v5 1761; CI-NEXT: v_fma_f32 v8, -v4, v7, v3 1762; CI-NEXT: v_fma_f32 v7, v8, v5, v7 1763; CI-NEXT: v_fma_f32 v3, -v4, v7, v3 1764; 
CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1765; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 1766; CI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 1767; CI-NEXT: v_trunc_f32_e32 v3, v3 1768; CI-NEXT: v_fma_f32 v2, -v3, v6, v2 1769; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 1770; CI-NEXT: v_or_b32_e32 v0, v2, v0 1771; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1772; CI-NEXT: s_endpgm 1773; 1774; VI-LABEL: frem_v4f16: 1775; VI: ; %bb.0: 1776; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1777; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1778; VI-NEXT: s_waitcnt lgkmcnt(0) 1779; VI-NEXT: v_mov_b32_e32 v2, s6 1780; VI-NEXT: s_add_u32 s0, s0, 32 1781; VI-NEXT: s_addc_u32 s1, s1, 0 1782; VI-NEXT: v_mov_b32_e32 v5, s1 1783; VI-NEXT: v_mov_b32_e32 v4, s0 1784; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 1785; VI-NEXT: v_mov_b32_e32 v3, s7 1786; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1787; VI-NEXT: v_mov_b32_e32 v0, s4 1788; VI-NEXT: v_mov_b32_e32 v1, s5 1789; VI-NEXT: s_waitcnt vmcnt(1) 1790; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 1791; VI-NEXT: v_cvt_f32_f16_e32 v9, v8 1792; VI-NEXT: s_waitcnt vmcnt(0) 1793; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 1794; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 1795; VI-NEXT: v_rcp_f32_e32 v9, v9 1796; VI-NEXT: v_mul_f32_e32 v7, v7, v9 1797; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 1798; VI-NEXT: v_div_fixup_f16 v7, v7, v8, v6 1799; VI-NEXT: v_trunc_f16_e32 v7, v7 1800; VI-NEXT: v_fma_f16 v6, -v7, v8, v6 1801; VI-NEXT: v_cvt_f32_f16_e32 v8, v5 1802; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 1803; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 1804; VI-NEXT: v_rcp_f32_e32 v8, v8 1805; VI-NEXT: v_mul_f32_e32 v7, v7, v8 1806; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 1807; VI-NEXT: v_div_fixup_f16 v7, v7, v5, v3 1808; VI-NEXT: v_trunc_f16_e32 v7, v7 1809; VI-NEXT: v_fma_f16 v3, -v7, v5, v3 1810; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 1811; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 1812; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 1813; VI-NEXT: v_or_b32_e32 v3, v3, v6 1814; VI-NEXT: v_cvt_f32_f16_e32 
v6, v5 1815; VI-NEXT: v_rcp_f32_e32 v8, v8 1816; VI-NEXT: v_mul_f32_e32 v6, v6, v8 1817; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 1818; VI-NEXT: v_div_fixup_f16 v6, v6, v7, v5 1819; VI-NEXT: v_trunc_f16_e32 v6, v6 1820; VI-NEXT: v_fma_f16 v5, -v6, v7, v5 1821; VI-NEXT: v_cvt_f32_f16_e32 v7, v4 1822; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 1823; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 1824; VI-NEXT: v_rcp_f32_e32 v7, v7 1825; VI-NEXT: v_mul_f32_e32 v6, v6, v7 1826; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 1827; VI-NEXT: v_div_fixup_f16 v6, v6, v4, v2 1828; VI-NEXT: v_trunc_f16_e32 v6, v6 1829; VI-NEXT: v_fma_f16 v2, -v6, v4, v2 1830; VI-NEXT: v_or_b32_e32 v2, v2, v5 1831; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1832; VI-NEXT: s_endpgm 1833; 1834; GFX9-LABEL: frem_v4f16: 1835; GFX9: ; %bb.0: 1836; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1837; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1838; GFX9-NEXT: v_mov_b32_e32 v4, 0 1839; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1840; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] 1841; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 1842; GFX9-NEXT: s_waitcnt vmcnt(1) 1843; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v1 1844; GFX9-NEXT: s_waitcnt vmcnt(0) 1845; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3 1846; GFX9-NEXT: v_rcp_f32_e32 v6, v6 1847; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 1848; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 1849; GFX9-NEXT: v_div_fixup_f16 v5, v5, v3, v1 1850; GFX9-NEXT: v_trunc_f16_e32 v5, v5 1851; GFX9-NEXT: v_fma_f16 v5, -v5, v3, v1 1852; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1853; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v3 1854; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1855; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v1 1856; GFX9-NEXT: v_rcp_f32_e32 v7, v7 1857; GFX9-NEXT: v_mul_f32_e32 v6, v6, v7 1858; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6 1859; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1 1860; GFX9-NEXT: v_trunc_f16_e32 v6, v6 1861; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1 1862; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1 1863; GFX9-NEXT: 
v_cvt_f32_f16_e32 v5, v2 1864; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 1865; GFX9-NEXT: v_rcp_f32_e32 v5, v5 1866; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5 1867; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 1868; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0 1869; GFX9-NEXT: v_trunc_f16_e32 v3, v3 1870; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0 1871; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1872; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v2 1873; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1874; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v0 1875; GFX9-NEXT: v_rcp_f32_e32 v6, v6 1876; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 1877; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 1878; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0 1879; GFX9-NEXT: v_trunc_f16_e32 v5, v5 1880; GFX9-NEXT: v_fma_f16 v0, -v5, v2, v0 1881; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0 1882; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 1883; GFX9-NEXT: s_endpgm 1884; 1885; GFX10-LABEL: frem_v4f16: 1886; GFX10: ; %bb.0: 1887; GFX10-NEXT: s_clause 0x1 1888; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1889; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1890; GFX10-NEXT: v_mov_b32_e32 v4, 0 1891; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1892; GFX10-NEXT: s_clause 0x1 1893; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] 1894; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 1895; GFX10-NEXT: s_waitcnt vmcnt(1) 1896; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 1897; GFX10-NEXT: s_waitcnt vmcnt(0) 1898; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3 1899; GFX10-NEXT: v_rcp_f32_e32 v6, v6 1900; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 1901; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 1902; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1 1903; GFX10-NEXT: v_trunc_f16_e32 v5, v5 1904; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1 1905; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1906; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1907; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3 1908; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1 1909; GFX10-NEXT: v_rcp_f32_e32 v7, v7 1910; GFX10-NEXT: v_mul_f32_e32 v6, v6, v7 1911; 
GFX10-NEXT: v_cvt_f16_f32_e32 v6, v6 1912; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1 1913; GFX10-NEXT: v_trunc_f16_e32 v6, v6 1914; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1 1915; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 1916; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1 1917; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 1918; GFX10-NEXT: v_rcp_f32_e32 v5, v5 1919; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 1920; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 1921; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0 1922; GFX10-NEXT: v_trunc_f16_e32 v3, v3 1923; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0 1924; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1925; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1926; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2 1927; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 1928; GFX10-NEXT: v_rcp_f32_e32 v6, v6 1929; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 1930; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 1931; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0 1932; GFX10-NEXT: v_trunc_f16_e32 v5, v5 1933; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0 1934; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0 1935; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 1936; GFX10-NEXT: s_endpgm 1937 <4 x half> addrspace(1)* %in2) #0 { 1938 %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4 1939 %r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16 1940 %r1 = load <4 x half>, <4 x half> addrspace(1)* %gep2, align 16 1941 %r2 = frem <4 x half> %r0, %r1 1942 store <4 x half> %r2, <4 x half> addrspace(1)* %out, align 16 1943 ret void 1944} 1945 ; @frem_v2f32 loads one <2 x float> from %in1 and one from %in2 advanced by 4 vector elements, then stores their frem to %out. The per-target assertions that follow are autogenerated by utils/update_llc_test_checks.py - regenerate them with that script instead of editing by hand. 1946define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, 1947; SI-LABEL: frem_v2f32: 1948; SI: ; %bb.0: 1949; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1950; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1951; SI-NEXT: s_mov_b32 s3, 0xf000 1952; SI-NEXT: s_mov_b32 s2, -1 1953; SI-NEXT: s_waitcnt lgkmcnt(0) 1954; SI-NEXT: s_mov_b32 s0, s4 1955; SI-NEXT: s_mov_b32 s1, s5 1956; SI-NEXT: s_mov_b32 s4, s6 1957; SI-NEXT: s_mov_b32 s5, s7 
1958; SI-NEXT: s_mov_b32 s6, s2 1959; SI-NEXT: s_mov_b32 s7, s3 1960; SI-NEXT: s_mov_b32 s10, s2 1961; SI-NEXT: s_mov_b32 s11, s3 1962; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1963; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 1964; SI-NEXT: s_waitcnt vmcnt(0) 1965; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 1966; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 1967; SI-NEXT: v_rcp_f32_e32 v6, v5 1968; SI-NEXT: s_mov_b32 s6, 3 1969; SI-NEXT: s_mov_b32 s7, 0 1970; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1971; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 1972; SI-NEXT: v_fma_f32 v6, v7, v6, v6 1973; SI-NEXT: v_mul_f32_e32 v7, v4, v6 1974; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 1975; SI-NEXT: v_fma_f32 v7, v8, v6, v7 1976; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 1977; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1978; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 1979; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 1980; SI-NEXT: v_trunc_f32_e32 v4, v4 1981; SI-NEXT: v_fma_f32 v1, -v4, v3, v1 1982; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 1983; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 1984; SI-NEXT: v_rcp_f32_e32 v5, v4 1985; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1986; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 1987; SI-NEXT: v_fma_f32 v5, v6, v5, v5 1988; SI-NEXT: v_mul_f32_e32 v6, v3, v5 1989; SI-NEXT: v_fma_f32 v7, -v4, v6, v3 1990; SI-NEXT: v_fma_f32 v6, v7, v5, v6 1991; SI-NEXT: v_fma_f32 v3, -v4, v6, v3 1992; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1993; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 1994; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 1995; SI-NEXT: v_trunc_f32_e32 v3, v3 1996; SI-NEXT: v_fma_f32 v0, -v3, v2, v0 1997; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1998; SI-NEXT: s_endpgm 1999; 2000; CI-LABEL: frem_v2f32: 2001; CI: ; %bb.0: 2002; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2003; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 2004; CI-NEXT: s_mov_b32 s3, 0xf000 2005; CI-NEXT: s_mov_b32 s2, -1 
2006; CI-NEXT: s_mov_b32 s10, s2 2007; CI-NEXT: s_waitcnt lgkmcnt(0) 2008; CI-NEXT: s_mov_b32 s0, s4 2009; CI-NEXT: s_mov_b32 s1, s5 2010; CI-NEXT: s_mov_b32 s4, s6 2011; CI-NEXT: s_mov_b32 s5, s7 2012; CI-NEXT: s_mov_b32 s6, s2 2013; CI-NEXT: s_mov_b32 s7, s3 2014; CI-NEXT: s_mov_b32 s11, s3 2015; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 2016; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 2017; CI-NEXT: s_mov_b32 s6, 3 2018; CI-NEXT: s_mov_b32 s7, 0 2019; CI-NEXT: s_waitcnt vmcnt(0) 2020; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 2021; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 2022; CI-NEXT: v_rcp_f32_e32 v6, v5 2023; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 2024; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 2025; CI-NEXT: v_fma_f32 v6, v7, v6, v6 2026; CI-NEXT: v_mul_f32_e32 v7, v4, v6 2027; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 2028; CI-NEXT: v_fma_f32 v7, v8, v6, v7 2029; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 2030; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 2031; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 2032; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 2033; CI-NEXT: v_trunc_f32_e32 v4, v4 2034; CI-NEXT: v_fma_f32 v1, -v4, v3, v1 2035; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 2036; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 2037; CI-NEXT: v_rcp_f32_e32 v5, v4 2038; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 2039; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 2040; CI-NEXT: v_fma_f32 v5, v6, v5, v5 2041; CI-NEXT: v_mul_f32_e32 v6, v3, v5 2042; CI-NEXT: v_fma_f32 v7, -v4, v6, v3 2043; CI-NEXT: v_fma_f32 v6, v7, v5, v6 2044; CI-NEXT: v_fma_f32 v3, -v4, v6, v3 2045; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 2046; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 2047; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 2048; CI-NEXT: v_trunc_f32_e32 v3, v3 2049; CI-NEXT: v_fma_f32 v0, -v3, v2, v0 2050; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2051; CI-NEXT: s_endpgm 2052; 2053; VI-LABEL: frem_v2f32: 2054; VI: ; %bb.0: 2055; 
VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2056; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2057; VI-NEXT: s_mov_b32 s2, 3 2058; VI-NEXT: s_mov_b32 s3, 0 2059; VI-NEXT: s_waitcnt lgkmcnt(0) 2060; VI-NEXT: v_mov_b32_e32 v2, s6 2061; VI-NEXT: s_add_u32 s0, s0, 32 2062; VI-NEXT: s_addc_u32 s1, s1, 0 2063; VI-NEXT: v_mov_b32_e32 v5, s1 2064; VI-NEXT: v_mov_b32_e32 v3, s7 2065; VI-NEXT: v_mov_b32_e32 v4, s0 2066; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 2067; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 2068; VI-NEXT: v_mov_b32_e32 v0, s4 2069; VI-NEXT: v_mov_b32_e32 v1, s5 2070; VI-NEXT: s_waitcnt vmcnt(0) 2071; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3 2072; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3 2073; VI-NEXT: v_rcp_f32_e32 v8, v7 2074; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 2075; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 2076; VI-NEXT: v_fma_f32 v8, v9, v8, v8 2077; VI-NEXT: v_mul_f32_e32 v9, v6, v8 2078; VI-NEXT: v_fma_f32 v10, -v7, v9, v6 2079; VI-NEXT: v_fma_f32 v9, v10, v8, v9 2080; VI-NEXT: v_fma_f32 v6, -v7, v9, v6 2081; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 2082; VI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 2083; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3 2084; VI-NEXT: v_trunc_f32_e32 v6, v6 2085; VI-NEXT: v_fma_f32 v3, -v6, v5, v3 2086; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2 2087; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2 2088; VI-NEXT: v_rcp_f32_e32 v7, v6 2089; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 2090; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2091; VI-NEXT: v_fma_f32 v7, v8, v7, v7 2092; VI-NEXT: v_mul_f32_e32 v8, v5, v7 2093; VI-NEXT: v_fma_f32 v9, -v6, v8, v5 2094; VI-NEXT: v_fma_f32 v8, v9, v7, v8 2095; VI-NEXT: v_fma_f32 v5, -v6, v8, v5 2096; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 2097; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2098; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2 2099; VI-NEXT: v_trunc_f32_e32 v5, v5 2100; VI-NEXT: v_fma_f32 v2, -v5, v4, v2 2101; VI-NEXT: flat_store_dwordx2 v[0:1], 
v[2:3] 2102; VI-NEXT: s_endpgm 2103; 2104; GFX9-LABEL: frem_v2f32: 2105; GFX9: ; %bb.0: 2106; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2107; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2108; GFX9-NEXT: v_mov_b32_e32 v4, 0 2109; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2110; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] 2111; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 2112; GFX9-NEXT: s_mov_b32 s2, 3 2113; GFX9-NEXT: s_mov_b32 s3, 0 2114; GFX9-NEXT: s_waitcnt vmcnt(0) 2115; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1 2116; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 2117; GFX9-NEXT: v_rcp_f32_e32 v7, v6 2118; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 2119; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2120; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7 2121; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7 2122; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5 2123; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8 2124; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5 2125; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 2126; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2127; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1 2128; GFX9-NEXT: v_trunc_f32_e32 v5, v5 2129; GFX9-NEXT: v_fma_f32 v1, -v5, v3, v1 2130; GFX9-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v0 2131; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 2132; GFX9-NEXT: v_rcp_f32_e32 v6, v5 2133; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 2134; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0 2135; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6 2136; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6 2137; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3 2138; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7 2139; GFX9-NEXT: v_fma_f32 v3, -v5, v7, v3 2140; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 2141; GFX9-NEXT: v_div_fmas_f32 v3, v3, v6, v7 2142; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0 2143; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2144; GFX9-NEXT: v_fma_f32 v0, -v3, v2, v0 2145; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 2146; GFX9-NEXT: s_endpgm 2147; 2148; GFX10-LABEL: 
frem_v2f32: 2149; GFX10: ; %bb.0: 2150; GFX10-NEXT: s_clause 0x1 2151; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2152; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2153; GFX10-NEXT: v_mov_b32_e32 v4, 0 2154; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2155; GFX10-NEXT: s_clause 0x1 2156; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] 2157; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 2158; GFX10-NEXT: s_waitcnt vmcnt(0) 2159; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1 2160; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 2161; GFX10-NEXT: v_rcp_f32_e32 v7, v6 2162; GFX10-NEXT: s_denorm_mode 15 2163; GFX10-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2164; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v7 2165; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7 2166; GFX10-NEXT: v_fma_f32 v9, -v6, v8, v5 2167; GFX10-NEXT: v_fmac_f32_e32 v8, v9, v7 2168; GFX10-NEXT: v_fma_f32 v5, -v6, v8, v5 2169; GFX10-NEXT: s_denorm_mode 12 2170; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2171; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1 2172; GFX10-NEXT: v_trunc_f32_e32 v5, v5 2173; GFX10-NEXT: v_fma_f32 v1, -v5, v3, v1 2174; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0 2175; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 2176; GFX10-NEXT: v_rcp_f32_e32 v6, v5 2177; GFX10-NEXT: s_denorm_mode 15 2178; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0 2179; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v6 2180; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6 2181; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v3 2182; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v6 2183; GFX10-NEXT: v_fma_f32 v3, -v5, v7, v3 2184; GFX10-NEXT: s_denorm_mode 12 2185; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7 2186; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0 2187; GFX10-NEXT: v_trunc_f32_e32 v3, v3 2188; GFX10-NEXT: v_fma_f32 v0, -v3, v2, v0 2189; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 2190; GFX10-NEXT: s_endpgm 2191 <2 x float> addrspace(1)* %in2) #0 { 2192 %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 2193 %r0 = load <2 x 
float>, <2 x float> addrspace(1)* %in1, align 8 2194 %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8 2195 %r2 = frem <2 x float> %r0, %r1 2196 store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8 2197 ret void 2198} 2199 2200define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, 2201; SI-LABEL: frem_v4f32: 2202; SI: ; %bb.0: 2203; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2204; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 2205; SI-NEXT: s_mov_b32 s3, 0xf000 2206; SI-NEXT: s_mov_b32 s2, -1 2207; SI-NEXT: s_waitcnt lgkmcnt(0) 2208; SI-NEXT: s_mov_b32 s0, s4 2209; SI-NEXT: s_mov_b32 s1, s5 2210; SI-NEXT: s_mov_b32 s4, s6 2211; SI-NEXT: s_mov_b32 s5, s7 2212; SI-NEXT: s_mov_b32 s6, s2 2213; SI-NEXT: s_mov_b32 s7, s3 2214; SI-NEXT: s_mov_b32 s10, s2 2215; SI-NEXT: s_mov_b32 s11, s3 2216; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 2217; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 2218; SI-NEXT: s_waitcnt vmcnt(0) 2219; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 2220; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 2221; SI-NEXT: v_rcp_f32_e32 v10, v9 2222; SI-NEXT: s_mov_b32 s6, 3 2223; SI-NEXT: s_mov_b32 s7, 0 2224; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 2225; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 2226; SI-NEXT: v_fma_f32 v10, v11, v10, v10 2227; SI-NEXT: v_mul_f32_e32 v11, v8, v10 2228; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 2229; SI-NEXT: v_fma_f32 v11, v12, v10, v11 2230; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 2231; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 2232; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 2233; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 2234; SI-NEXT: v_trunc_f32_e32 v8, v8 2235; SI-NEXT: v_fma_f32 v3, -v8, v7, v3 2236; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 2237; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 2238; SI-NEXT: v_rcp_f32_e32 v9, v8 2239; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 2240; SI-NEXT: v_fma_f32 
v10, -v8, v9, 1.0 2241; SI-NEXT: v_fma_f32 v9, v10, v9, v9 2242; SI-NEXT: v_mul_f32_e32 v10, v7, v9 2243; SI-NEXT: v_fma_f32 v11, -v8, v10, v7 2244; SI-NEXT: v_fma_f32 v10, v11, v9, v10 2245; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 2246; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 2247; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 2248; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 2249; SI-NEXT: v_trunc_f32_e32 v7, v7 2250; SI-NEXT: v_fma_f32 v2, -v7, v6, v2 2251; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 2252; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 2253; SI-NEXT: v_rcp_f32_e32 v8, v7 2254; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 2255; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 2256; SI-NEXT: v_fma_f32 v8, v9, v8, v8 2257; SI-NEXT: v_mul_f32_e32 v9, v6, v8 2258; SI-NEXT: v_fma_f32 v10, -v7, v9, v6 2259; SI-NEXT: v_fma_f32 v9, v10, v8, v9 2260; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 2261; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 2262; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 2263; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2264; SI-NEXT: v_trunc_f32_e32 v6, v6 2265; SI-NEXT: v_fma_f32 v1, -v6, v5, v1 2266; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 2267; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 2268; SI-NEXT: v_rcp_f32_e32 v7, v6 2269; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 2270; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2271; SI-NEXT: v_fma_f32 v7, v8, v7, v7 2272; SI-NEXT: v_mul_f32_e32 v8, v5, v7 2273; SI-NEXT: v_fma_f32 v9, -v6, v8, v5 2274; SI-NEXT: v_fma_f32 v8, v9, v7, v8 2275; SI-NEXT: v_fma_f32 v5, -v6, v8, v5 2276; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 2277; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2278; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2279; SI-NEXT: v_trunc_f32_e32 v5, v5 2280; SI-NEXT: v_fma_f32 v0, -v5, v4, v0 2281; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2282; SI-NEXT: s_endpgm 2283; 2284; CI-LABEL: frem_v4f32: 2285; CI: ; %bb.0: 2286; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2287; CI-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0xd 2288; CI-NEXT: s_mov_b32 s3, 0xf000 2289; CI-NEXT: s_mov_b32 s2, -1 2290; CI-NEXT: s_mov_b32 s10, s2 2291; CI-NEXT: s_waitcnt lgkmcnt(0) 2292; CI-NEXT: s_mov_b32 s0, s4 2293; CI-NEXT: s_mov_b32 s1, s5 2294; CI-NEXT: s_mov_b32 s4, s6 2295; CI-NEXT: s_mov_b32 s5, s7 2296; CI-NEXT: s_mov_b32 s6, s2 2297; CI-NEXT: s_mov_b32 s7, s3 2298; CI-NEXT: s_mov_b32 s11, s3 2299; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 2300; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 2301; CI-NEXT: s_mov_b32 s6, 3 2302; CI-NEXT: s_mov_b32 s7, 0 2303; CI-NEXT: s_waitcnt vmcnt(0) 2304; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 2305; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 2306; CI-NEXT: v_rcp_f32_e32 v10, v9 2307; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 2308; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 2309; CI-NEXT: v_fma_f32 v10, v11, v10, v10 2310; CI-NEXT: v_mul_f32_e32 v11, v8, v10 2311; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 2312; CI-NEXT: v_fma_f32 v11, v12, v10, v11 2313; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 2314; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 2315; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 2316; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 2317; CI-NEXT: v_trunc_f32_e32 v8, v8 2318; CI-NEXT: v_fma_f32 v3, -v8, v7, v3 2319; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 2320; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 2321; CI-NEXT: v_rcp_f32_e32 v9, v8 2322; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 2323; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 2324; CI-NEXT: v_fma_f32 v9, v10, v9, v9 2325; CI-NEXT: v_mul_f32_e32 v10, v7, v9 2326; CI-NEXT: v_fma_f32 v11, -v8, v10, v7 2327; CI-NEXT: v_fma_f32 v10, v11, v9, v10 2328; CI-NEXT: v_fma_f32 v7, -v8, v10, v7 2329; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 2330; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 2331; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 2332; CI-NEXT: v_trunc_f32_e32 v7, v7 2333; CI-NEXT: v_fma_f32 v2, -v7, v6, v2 2334; 
CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 2335; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 2336; CI-NEXT: v_rcp_f32_e32 v8, v7 2337; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 2338; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 2339; CI-NEXT: v_fma_f32 v8, v9, v8, v8 2340; CI-NEXT: v_mul_f32_e32 v9, v6, v8 2341; CI-NEXT: v_fma_f32 v10, -v7, v9, v6 2342; CI-NEXT: v_fma_f32 v9, v10, v8, v9 2343; CI-NEXT: v_fma_f32 v6, -v7, v9, v6 2344; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 2345; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 2346; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2347; CI-NEXT: v_trunc_f32_e32 v6, v6 2348; CI-NEXT: v_fma_f32 v1, -v6, v5, v1 2349; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 2350; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 2351; CI-NEXT: v_rcp_f32_e32 v7, v6 2352; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 2353; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2354; CI-NEXT: v_fma_f32 v7, v8, v7, v7 2355; CI-NEXT: v_mul_f32_e32 v8, v5, v7 2356; CI-NEXT: v_fma_f32 v9, -v6, v8, v5 2357; CI-NEXT: v_fma_f32 v8, v9, v7, v8 2358; CI-NEXT: v_fma_f32 v5, -v6, v8, v5 2359; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 2360; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2361; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2362; CI-NEXT: v_trunc_f32_e32 v5, v5 2363; CI-NEXT: v_fma_f32 v0, -v5, v4, v0 2364; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2365; CI-NEXT: s_endpgm 2366; 2367; VI-LABEL: frem_v4f32: 2368; VI: ; %bb.0: 2369; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2370; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2371; VI-NEXT: s_mov_b32 s2, 3 2372; VI-NEXT: s_mov_b32 s3, 0 2373; VI-NEXT: s_waitcnt lgkmcnt(0) 2374; VI-NEXT: v_mov_b32_e32 v0, s6 2375; VI-NEXT: s_add_u32 s0, s0, 64 2376; VI-NEXT: s_addc_u32 s1, s1, 0 2377; VI-NEXT: v_mov_b32_e32 v5, s1 2378; VI-NEXT: v_mov_b32_e32 v1, s7 2379; VI-NEXT: v_mov_b32_e32 v4, s0 2380; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2381; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 2382; VI-NEXT: 
v_mov_b32_e32 v8, s4 2383; VI-NEXT: v_mov_b32_e32 v9, s5 2384; VI-NEXT: s_waitcnt vmcnt(0) 2385; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3 2386; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 2387; VI-NEXT: v_rcp_f32_e32 v12, v11 2388; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 2389; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 2390; VI-NEXT: v_fma_f32 v12, v13, v12, v12 2391; VI-NEXT: v_mul_f32_e32 v13, v10, v12 2392; VI-NEXT: v_fma_f32 v14, -v11, v13, v10 2393; VI-NEXT: v_fma_f32 v13, v14, v12, v13 2394; VI-NEXT: v_fma_f32 v10, -v11, v13, v10 2395; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 2396; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 2397; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3 2398; VI-NEXT: v_trunc_f32_e32 v10, v10 2399; VI-NEXT: v_fma_f32 v3, -v10, v7, v3 2400; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2 2401; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 2402; VI-NEXT: v_rcp_f32_e32 v11, v10 2403; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 2404; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0 2405; VI-NEXT: v_fma_f32 v11, v12, v11, v11 2406; VI-NEXT: v_mul_f32_e32 v12, v7, v11 2407; VI-NEXT: v_fma_f32 v13, -v10, v12, v7 2408; VI-NEXT: v_fma_f32 v12, v13, v11, v12 2409; VI-NEXT: v_fma_f32 v7, -v10, v12, v7 2410; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 2411; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12 2412; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 2413; VI-NEXT: v_trunc_f32_e32 v7, v7 2414; VI-NEXT: v_fma_f32 v2, -v7, v6, v2 2415; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 2416; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 2417; VI-NEXT: v_rcp_f32_e32 v10, v7 2418; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 2419; VI-NEXT: v_fma_f32 v11, -v7, v10, 1.0 2420; VI-NEXT: v_fma_f32 v10, v11, v10, v10 2421; VI-NEXT: v_mul_f32_e32 v11, v6, v10 2422; VI-NEXT: v_fma_f32 v12, -v7, v11, v6 2423; VI-NEXT: v_fma_f32 v11, v12, v10, v11 2424; VI-NEXT: v_fma_f32 v6, -v7, v11, v6 2425; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), 
s3 2426; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 2427; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2428; VI-NEXT: v_trunc_f32_e32 v6, v6 2429; VI-NEXT: v_fma_f32 v1, -v6, v5, v1 2430; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 2431; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 2432; VI-NEXT: v_rcp_f32_e32 v7, v6 2433; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 2434; VI-NEXT: v_fma_f32 v10, -v6, v7, 1.0 2435; VI-NEXT: v_fma_f32 v7, v10, v7, v7 2436; VI-NEXT: v_mul_f32_e32 v10, v5, v7 2437; VI-NEXT: v_fma_f32 v11, -v6, v10, v5 2438; VI-NEXT: v_fma_f32 v10, v11, v7, v10 2439; VI-NEXT: v_fma_f32 v5, -v6, v10, v5 2440; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 2441; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v10 2442; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2443; VI-NEXT: v_trunc_f32_e32 v5, v5 2444; VI-NEXT: v_fma_f32 v0, -v5, v4, v0 2445; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 2446; VI-NEXT: s_endpgm 2447; 2448; GFX9-LABEL: frem_v4f32: 2449; GFX9: ; %bb.0: 2450; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2451; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2452; GFX9-NEXT: v_mov_b32_e32 v8, 0 2453; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2454; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] 2455; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 2456; GFX9-NEXT: s_mov_b32 s2, 3 2457; GFX9-NEXT: s_mov_b32 s3, 0 2458; GFX9-NEXT: s_waitcnt vmcnt(0) 2459; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3 2460; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 2461; GFX9-NEXT: v_rcp_f32_e32 v11, v10 2462; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 2463; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0 2464; GFX9-NEXT: v_fma_f32 v11, v12, v11, v11 2465; GFX9-NEXT: v_mul_f32_e32 v12, v9, v11 2466; GFX9-NEXT: v_fma_f32 v13, -v10, v12, v9 2467; GFX9-NEXT: v_fma_f32 v12, v13, v11, v12 2468; GFX9-NEXT: v_fma_f32 v9, -v10, v12, v9 2469; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 2470; GFX9-NEXT: v_div_fmas_f32 v9, v9, v11, v12 2471; GFX9-NEXT: 
v_div_fixup_f32 v9, v9, v7, v3 2472; GFX9-NEXT: v_trunc_f32_e32 v9, v9 2473; GFX9-NEXT: v_fma_f32 v3, -v9, v7, v3 2474; GFX9-NEXT: v_div_scale_f32 v9, s[0:1], v6, v6, v2 2475; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 2476; GFX9-NEXT: v_rcp_f32_e32 v10, v9 2477; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 2478; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0 2479; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10 2480; GFX9-NEXT: v_mul_f32_e32 v11, v7, v10 2481; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v7 2482; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11 2483; GFX9-NEXT: v_fma_f32 v7, -v9, v11, v7 2484; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 2485; GFX9-NEXT: v_div_fmas_f32 v7, v7, v10, v11 2486; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2 2487; GFX9-NEXT: v_trunc_f32_e32 v7, v7 2488; GFX9-NEXT: v_fma_f32 v2, -v7, v6, v2 2489; GFX9-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 2490; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 2491; GFX9-NEXT: v_rcp_f32_e32 v9, v7 2492; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 2493; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0 2494; GFX9-NEXT: v_fma_f32 v9, v10, v9, v9 2495; GFX9-NEXT: v_mul_f32_e32 v10, v6, v9 2496; GFX9-NEXT: v_fma_f32 v11, -v7, v10, v6 2497; GFX9-NEXT: v_fma_f32 v10, v11, v9, v10 2498; GFX9-NEXT: v_fma_f32 v6, -v7, v10, v6 2499; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 2500; GFX9-NEXT: v_div_fmas_f32 v6, v6, v9, v10 2501; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2502; GFX9-NEXT: v_trunc_f32_e32 v6, v6 2503; GFX9-NEXT: v_fma_f32 v1, -v6, v5, v1 2504; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 2505; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 2506; GFX9-NEXT: v_rcp_f32_e32 v7, v6 2507; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 2508; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0 2509; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7 2510; GFX9-NEXT: v_mul_f32_e32 v9, v5, v7 2511; GFX9-NEXT: v_fma_f32 v10, -v6, v9, v5 2512; GFX9-NEXT: v_fma_f32 v9, v10, v7, v9 2513; GFX9-NEXT: v_fma_f32 v5, -v6, 
v9, v5 2514; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 2515; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v9 2516; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2517; GFX9-NEXT: v_trunc_f32_e32 v5, v5 2518; GFX9-NEXT: v_fma_f32 v0, -v5, v4, v0 2519; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] 2520; GFX9-NEXT: s_endpgm 2521; 2522; GFX10-LABEL: frem_v4f32: 2523; GFX10: ; %bb.0: 2524; GFX10-NEXT: s_clause 0x1 2525; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2526; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2527; GFX10-NEXT: v_mov_b32_e32 v8, 0 2528; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2529; GFX10-NEXT: s_clause 0x1 2530; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] 2531; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 2532; GFX10-NEXT: s_waitcnt vmcnt(0) 2533; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3 2534; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 2535; GFX10-NEXT: v_rcp_f32_e32 v11, v10 2536; GFX10-NEXT: s_denorm_mode 15 2537; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0 2538; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v11 2539; GFX10-NEXT: v_mul_f32_e32 v12, v9, v11 2540; GFX10-NEXT: v_fma_f32 v13, -v10, v12, v9 2541; GFX10-NEXT: v_fmac_f32_e32 v12, v13, v11 2542; GFX10-NEXT: v_fma_f32 v9, -v10, v12, v9 2543; GFX10-NEXT: s_denorm_mode 12 2544; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12 2545; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3 2546; GFX10-NEXT: v_trunc_f32_e32 v9, v9 2547; GFX10-NEXT: v_fma_f32 v3, -v9, v7, v3 2548; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2 2549; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 2550; GFX10-NEXT: v_rcp_f32_e32 v10, v9 2551; GFX10-NEXT: s_denorm_mode 15 2552; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0 2553; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v10 2554; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10 2555; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7 2556; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v10 2557; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7 2558; GFX10-NEXT: s_denorm_mode 12 2559; GFX10-NEXT: 
v_div_fmas_f32 v7, v7, v10, v11 2560; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2 2561; GFX10-NEXT: v_trunc_f32_e32 v7, v7 2562; GFX10-NEXT: v_fma_f32 v2, -v7, v6, v2 2563; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1 2564; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 2565; GFX10-NEXT: v_rcp_f32_e32 v9, v7 2566; GFX10-NEXT: s_denorm_mode 15 2567; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0 2568; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v9 2569; GFX10-NEXT: v_mul_f32_e32 v10, v6, v9 2570; GFX10-NEXT: v_fma_f32 v11, -v7, v10, v6 2571; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v9 2572; GFX10-NEXT: v_fma_f32 v6, -v7, v10, v6 2573; GFX10-NEXT: s_denorm_mode 12 2574; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10 2575; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2576; GFX10-NEXT: v_trunc_f32_e32 v6, v6 2577; GFX10-NEXT: v_fma_f32 v1, -v6, v5, v1 2578; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0 2579; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 2580; GFX10-NEXT: v_rcp_f32_e32 v7, v6 2581; GFX10-NEXT: s_denorm_mode 15 2582; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0 2583; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v7 2584; GFX10-NEXT: v_mul_f32_e32 v9, v5, v7 2585; GFX10-NEXT: v_fma_f32 v10, -v6, v9, v5 2586; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v7 2587; GFX10-NEXT: v_fma_f32 v5, -v6, v9, v5 2588; GFX10-NEXT: s_denorm_mode 12 2589; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9 2590; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2591; GFX10-NEXT: v_trunc_f32_e32 v5, v5 2592; GFX10-NEXT: v_fma_f32 v0, -v5, v4, v0 2593; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] 2594; GFX10-NEXT: s_endpgm 2595 <4 x float> addrspace(1)* %in2) #0 { 2596 %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 2597 %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16 2598 %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16 2599 %r2 = frem <4 x float> %r0, %r1 2600 store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16 2601 ret void 2602} 2603 2604define 
amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
; Test kernel: loads two <2 x double> vectors from global memory
; (addrspace(1)), computes their frem, and stores the result to %out.
; The autogenerated check blocks below pin the expected llc output, one block
; per FileCheck prefix declared in the RUN lines at the top of the file.
; SI-LABEL: frem_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s8
; SI-NEXT: s_mov_b32 s5, s9
; SI-NEXT: s_mov_b32 s8, s10
; SI-NEXT: s_mov_b32 s9, s11
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s2, s6
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; SI-NEXT: v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3]
; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
; SI-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13]
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v13
; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; SI-NEXT: v_bfe_u32 v10, v9, 20, 11
; SI-NEXT: s_movk_i32 s8, 0xfc01
; SI-NEXT: v_add_i32_e32 v12, vcc, s8, v10
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: v_lshr_b64 v[10:11], s[2:3], v12
; SI-NEXT: v_not_b32_e32 v10, v10
; SI-NEXT: v_and_b32_e32 v10, v8, v10
; SI-NEXT: v_not_b32_e32 v11, v11
; SI-NEXT: v_and_b32_e32 v11, v9, v11
; SI-NEXT: s_brev_b32 s9, 1
; SI-NEXT: v_and_b32_e32 v13, s9, v9
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v12
; SI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc
; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v12
; SI-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
; SI-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[0:1]
; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; SI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1]
; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
; SI-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11]
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v11
; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; SI-NEXT: v_bfe_u32 v8, v7, 20, 11
; SI-NEXT: v_add_i32_e32 v10, vcc, s8, v8
; SI-NEXT: v_lshr_b64 v[8:9], s[2:3], v10
; SI-NEXT: v_not_b32_e32 v8, v8
; SI-NEXT: v_and_b32_e32 v8, v6, v8
; SI-NEXT: v_not_b32_e32 v9, v9
; SI-NEXT: v_and_b32_e32 v9, v7, v9
; SI-NEXT: v_and_b32_e32 v11, s9, v7
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v10
; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v10
; SI-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
; SI-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1]
; SI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: frem_v2f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s10, s2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b32 s0, s4
; CI-NEXT: s_mov_b32 s1, s5
; CI-NEXT: s_mov_b32 s4, s6
; CI-NEXT: s_mov_b32 s5, s7
; CI-NEXT: s_mov_b32 s6, s2
; CI-NEXT: s_mov_b32 s7, s3
; CI-NEXT: s_mov_b32 s11, s3
; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3]
; CI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; CI-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
; CI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
; CI-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
; CI-NEXT: s_nop 1
; CI-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
; CI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; CI-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
; CI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; CI-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1]
; CI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; CI-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
; CI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
; CI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; CI-NEXT: s_nop 1
; CI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; CI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; CI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
; CI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: frem_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_add_u32 s0, s0, 64
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3]
; VI-NEXT: v_rcp_f64_e32 v[12:13], v[10:11]
; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
; VI-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3]
; VI-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13]
; VI-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15]
; VI-NEXT: s_nop 1
; VI-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17]
; VI-NEXT: v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3]
; VI-NEXT: v_trunc_f64_e32 v[10:11], v[10:11]
; VI-NEXT: v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3]
; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
; VI-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; VI-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1]
; VI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13]
; VI-NEXT: s_nop 1
; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15]
; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
; VI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: frem_v2f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v16, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
; GFX9-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; GFX9-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
; GFX9-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; GFX9-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; GFX9-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; GFX9-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
; GFX9-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: frem_v2f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v16, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3]
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; GFX10-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1]
; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; GFX10-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5]
; GFX10-NEXT: s_endpgm
                        <2 x double> addrspace(1)* %in2) #0 {
  ; Step %in2 forward by four <2 x double> elements (4 * 16 = 64 bytes). The
  ; check blocks show this either as offset:64 on the second load or as an
  ; explicit add of 64 to the base address before the load.
  %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4
  ; %r0 is the dividend (read from %in1), %r1 the divisor (read from %gep2).
  %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16
  %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16
  ; The checks expect frem to be expanded once per lane as
  ; x - trunc(x / y) * y: a v_div_scale / v_rcp / v_fma / v_div_fmas /
  ; v_div_fixup division, a truncation of the quotient, then a final v_fma_f64
  ; using the negated quotient. The checks for the SI prefix contain no
  ; v_trunc_f64; a v_bfe_u32 / v_lshr_b64 / v_cndmask sequence sits in that
  ; position instead (presumably a bit-level truncation for a target without
  ; the instruction -- TODO confirm against the backend).
  %r2 = frem <2 x double> %r0, %r1
  store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16
  ret void
}

; Attribute group #0 (used by @frem_v2f64): nounwind, "unsafe-fp-math" set to
; "false", and "denormal-fp-math-f32" set to "preserve-sign,preserve-sign".
; Attribute group #1 differs only in setting "unsafe-fp-math" to "true"; it is
; not referenced by @frem_v2f64 (presumably used by tests elsewhere in the file).
attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
