1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s 3; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s 4; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s 5; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s 6; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s 7 8define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, 9; SI-LABEL: frem_f16: 10; SI: ; %bb.0: 11; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 12; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 13; SI-NEXT: s_mov_b32 s11, 0xf000 14; SI-NEXT: s_mov_b32 s10, -1 15; SI-NEXT: s_waitcnt lgkmcnt(0) 16; SI-NEXT: s_mov_b32 s8, s4 17; SI-NEXT: s_mov_b32 s9, s5 18; SI-NEXT: s_mov_b32 s4, s6 19; SI-NEXT: s_mov_b32 s5, s7 20; SI-NEXT: s_mov_b32 s6, s10 21; SI-NEXT: s_mov_b32 s7, s11 22; SI-NEXT: s_mov_b32 s2, s10 23; SI-NEXT: s_mov_b32 s3, s11 24; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 25; SI-NEXT: s_waitcnt vmcnt(0) 26; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 27; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 28; SI-NEXT: s_waitcnt vmcnt(0) 29; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 30; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 31; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 32; SI-NEXT: v_rcp_f32_e32 v4, v3 33; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 34; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 35; SI-NEXT: v_fma_f32 v4, v5, v4, v4 36; SI-NEXT: v_mul_f32_e32 v5, v2, v4 
37; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 38; SI-NEXT: v_fma_f32 v5, v6, v4, v5 39; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 40; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 41; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 42; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 43; SI-NEXT: v_trunc_f32_e32 v2, v2 44; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 45; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 46; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 47; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 48; SI-NEXT: s_endpgm 49; 50; CI-LABEL: frem_f16: 51; CI: ; %bb.0: 52; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 53; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 54; CI-NEXT: s_mov_b32 s11, 0xf000 55; CI-NEXT: s_mov_b32 s10, -1 56; CI-NEXT: s_mov_b32 s2, s10 57; CI-NEXT: s_waitcnt lgkmcnt(0) 58; CI-NEXT: s_mov_b32 s8, s4 59; CI-NEXT: s_mov_b32 s9, s5 60; CI-NEXT: s_mov_b32 s4, s6 61; CI-NEXT: s_mov_b32 s5, s7 62; CI-NEXT: s_mov_b32 s6, s10 63; CI-NEXT: s_mov_b32 s7, s11 64; CI-NEXT: s_mov_b32 s3, s11 65; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 66; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 67; CI-NEXT: s_waitcnt vmcnt(1) 68; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 69; CI-NEXT: s_waitcnt vmcnt(0) 70; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 71; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 72; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 73; CI-NEXT: v_rcp_f32_e32 v4, v3 74; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 75; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 76; CI-NEXT: v_fma_f32 v4, v5, v4, v4 77; CI-NEXT: v_mul_f32_e32 v5, v2, v4 78; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 79; CI-NEXT: v_fma_f32 v5, v6, v4, v5 80; CI-NEXT: v_fma_f32 v2, -v3, v5, v2 81; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 82; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 83; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 84; CI-NEXT: v_trunc_f32_e32 v2, v2 85; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 86; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 87; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 88; 
CI-NEXT: buffer_store_short v0, off, s[8:11], 0 89; CI-NEXT: s_endpgm 90; 91; VI-LABEL: frem_f16: 92; VI: ; %bb.0: 93; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 94; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 95; VI-NEXT: s_waitcnt lgkmcnt(0) 96; VI-NEXT: v_mov_b32_e32 v2, s6 97; VI-NEXT: s_add_u32 s0, s0, 8 98; VI-NEXT: v_mov_b32_e32 v3, s7 99; VI-NEXT: s_addc_u32 s1, s1, 0 100; VI-NEXT: flat_load_ushort v4, v[2:3] 101; VI-NEXT: v_mov_b32_e32 v3, s1 102; VI-NEXT: v_mov_b32_e32 v2, s0 103; VI-NEXT: flat_load_ushort v2, v[2:3] 104; VI-NEXT: v_mov_b32_e32 v0, s4 105; VI-NEXT: v_mov_b32_e32 v1, s5 106; VI-NEXT: s_waitcnt vmcnt(1) 107; VI-NEXT: v_cvt_f32_f16_e32 v3, v4 108; VI-NEXT: s_waitcnt vmcnt(0) 109; VI-NEXT: v_cvt_f32_f16_e32 v5, v2 110; VI-NEXT: v_rcp_f32_e32 v5, v5 111; VI-NEXT: v_mul_f32_e32 v3, v3, v5 112; VI-NEXT: v_cvt_f16_f32_e32 v3, v3 113; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4 114; VI-NEXT: v_trunc_f16_e32 v3, v3 115; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 116; VI-NEXT: flat_store_short v[0:1], v2 117; VI-NEXT: s_endpgm 118; 119; GFX9-LABEL: frem_f16: 120; GFX9: ; %bb.0: 121; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 122; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 123; GFX9-NEXT: v_mov_b32_e32 v0, 0 124; GFX9-NEXT: s_waitcnt lgkmcnt(0) 125; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] 126; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 127; GFX9-NEXT: s_waitcnt vmcnt(1) 128; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 129; GFX9-NEXT: s_waitcnt vmcnt(0) 130; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 131; GFX9-NEXT: v_rcp_f32_e32 v4, v4 132; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 133; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 134; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 135; GFX9-NEXT: v_trunc_f16_e32 v3, v3 136; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 137; GFX9-NEXT: global_store_short v0, v1, s[4:5] 138; GFX9-NEXT: s_endpgm 139; 140; GFX10-LABEL: frem_f16: 141; GFX10: ; %bb.0: 142; GFX10-NEXT: s_clause 0x1 143; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 
144; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 145; GFX10-NEXT: v_mov_b32_e32 v0, 0 146; GFX10-NEXT: s_waitcnt lgkmcnt(0) 147; GFX10-NEXT: s_clause 0x1 148; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] 149; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 150; GFX10-NEXT: s_waitcnt vmcnt(1) 151; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 152; GFX10-NEXT: s_waitcnt vmcnt(0) 153; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 154; GFX10-NEXT: v_rcp_f32_e32 v4, v4 155; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 156; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 157; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 158; GFX10-NEXT: v_trunc_f16_e32 v3, v3 159; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 160; GFX10-NEXT: global_store_short v0, v1, s[4:5] 161; GFX10-NEXT: s_endpgm 162 half addrspace(1)* %in2) #0 { 163 %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 164 %r0 = load half, half addrspace(1)* %in1, align 4 165 %r1 = load half, half addrspace(1)* %gep2, align 4 166 %r2 = frem half %r0, %r1 167 store half %r2, half addrspace(1)* %out, align 4 168 ret void 169} 170 171define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, 172; SI-LABEL: fast_frem_f16: 173; SI: ; %bb.0: 174; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 175; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 176; SI-NEXT: s_mov_b32 s11, 0xf000 177; SI-NEXT: s_mov_b32 s10, -1 178; SI-NEXT: s_waitcnt lgkmcnt(0) 179; SI-NEXT: s_mov_b32 s8, s4 180; SI-NEXT: s_mov_b32 s9, s5 181; SI-NEXT: s_mov_b32 s4, s6 182; SI-NEXT: s_mov_b32 s5, s7 183; SI-NEXT: s_mov_b32 s6, s10 184; SI-NEXT: s_mov_b32 s7, s11 185; SI-NEXT: s_mov_b32 s2, s10 186; SI-NEXT: s_mov_b32 s3, s11 187; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 188; SI-NEXT: s_waitcnt vmcnt(0) 189; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 190; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 191; SI-NEXT: s_waitcnt vmcnt(0) 192; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 193; SI-NEXT: v_rcp_f32_e32 v2, v1 194; SI-NEXT: v_mul_f32_e32 v2, v0, v2 195; 
SI-NEXT: v_trunc_f32_e32 v2, v2 196; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 197; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 198; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 199; SI-NEXT: s_endpgm 200; 201; CI-LABEL: fast_frem_f16: 202; CI: ; %bb.0: 203; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 204; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 205; CI-NEXT: s_mov_b32 s11, 0xf000 206; CI-NEXT: s_mov_b32 s10, -1 207; CI-NEXT: s_mov_b32 s2, s10 208; CI-NEXT: s_mov_b32 s3, s11 209; CI-NEXT: s_waitcnt lgkmcnt(0) 210; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 211; CI-NEXT: s_mov_b32 s8, s4 212; CI-NEXT: s_mov_b32 s9, s5 213; CI-NEXT: s_mov_b32 s4, s6 214; CI-NEXT: s_mov_b32 s5, s7 215; CI-NEXT: s_mov_b32 s6, s10 216; CI-NEXT: s_mov_b32 s7, s11 217; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 218; CI-NEXT: s_waitcnt vmcnt(1) 219; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 220; CI-NEXT: v_rcp_f32_e32 v2, v1 221; CI-NEXT: s_waitcnt vmcnt(0) 222; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 223; CI-NEXT: v_mul_f32_e32 v2, v0, v2 224; CI-NEXT: v_trunc_f32_e32 v2, v2 225; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 226; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 227; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 228; CI-NEXT: s_endpgm 229; 230; VI-LABEL: fast_frem_f16: 231; VI: ; %bb.0: 232; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 233; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 234; VI-NEXT: s_waitcnt lgkmcnt(0) 235; VI-NEXT: v_mov_b32_e32 v2, s6 236; VI-NEXT: s_add_u32 s0, s0, 8 237; VI-NEXT: v_mov_b32_e32 v3, s7 238; VI-NEXT: s_addc_u32 s1, s1, 0 239; VI-NEXT: flat_load_ushort v4, v[2:3] 240; VI-NEXT: v_mov_b32_e32 v3, s1 241; VI-NEXT: v_mov_b32_e32 v2, s0 242; VI-NEXT: flat_load_ushort v2, v[2:3] 243; VI-NEXT: v_mov_b32_e32 v0, s4 244; VI-NEXT: v_mov_b32_e32 v1, s5 245; VI-NEXT: s_waitcnt vmcnt(0) 246; VI-NEXT: v_rcp_f16_e32 v3, v2 247; VI-NEXT: v_mul_f16_e32 v3, v4, v3 248; VI-NEXT: v_trunc_f16_e32 v3, v3 249; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 250; VI-NEXT: flat_store_short v[0:1], v2 251; 
VI-NEXT: s_endpgm 252; 253; GFX9-LABEL: fast_frem_f16: 254; GFX9: ; %bb.0: 255; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 256; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 257; GFX9-NEXT: v_mov_b32_e32 v0, 0 258; GFX9-NEXT: s_waitcnt lgkmcnt(0) 259; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] 260; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 261; GFX9-NEXT: s_waitcnt vmcnt(0) 262; GFX9-NEXT: v_rcp_f16_e32 v3, v2 263; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 264; GFX9-NEXT: v_trunc_f16_e32 v3, v3 265; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 266; GFX9-NEXT: global_store_short v0, v1, s[4:5] 267; GFX9-NEXT: s_endpgm 268; 269; GFX10-LABEL: fast_frem_f16: 270; GFX10: ; %bb.0: 271; GFX10-NEXT: s_clause 0x1 272; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 273; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 274; GFX10-NEXT: v_mov_b32_e32 v0, 0 275; GFX10-NEXT: s_waitcnt lgkmcnt(0) 276; GFX10-NEXT: s_clause 0x1 277; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] 278; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 279; GFX10-NEXT: s_waitcnt vmcnt(0) 280; GFX10-NEXT: v_rcp_f16_e32 v3, v2 281; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 282; GFX10-NEXT: v_trunc_f16_e32 v3, v3 283; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 284; GFX10-NEXT: global_store_short v0, v1, s[4:5] 285; GFX10-NEXT: s_endpgm 286 half addrspace(1)* %in2) #0 { 287 %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 288 %r0 = load half, half addrspace(1)* %in1, align 4 289 %r1 = load half, half addrspace(1)* %gep2, align 4 290 %r2 = frem fast half %r0, %r1 291 store half %r2, half addrspace(1)* %out, align 4 292 ret void 293} 294 295define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, 296; SI-LABEL: unsafe_frem_f16: 297; SI: ; %bb.0: 298; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 299; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 300; SI-NEXT: s_mov_b32 s11, 0xf000 301; SI-NEXT: s_mov_b32 s10, -1 302; SI-NEXT: s_waitcnt lgkmcnt(0) 303; SI-NEXT: 
s_mov_b32 s8, s4 304; SI-NEXT: s_mov_b32 s9, s5 305; SI-NEXT: s_mov_b32 s4, s6 306; SI-NEXT: s_mov_b32 s5, s7 307; SI-NEXT: s_mov_b32 s6, s10 308; SI-NEXT: s_mov_b32 s7, s11 309; SI-NEXT: s_mov_b32 s2, s10 310; SI-NEXT: s_mov_b32 s3, s11 311; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 312; SI-NEXT: s_waitcnt vmcnt(0) 313; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 314; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 315; SI-NEXT: s_waitcnt vmcnt(0) 316; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 317; SI-NEXT: v_rcp_f32_e32 v2, v1 318; SI-NEXT: v_mul_f32_e32 v2, v0, v2 319; SI-NEXT: v_trunc_f32_e32 v2, v2 320; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 321; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 322; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 323; SI-NEXT: s_endpgm 324; 325; CI-LABEL: unsafe_frem_f16: 326; CI: ; %bb.0: 327; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 328; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 329; CI-NEXT: s_mov_b32 s11, 0xf000 330; CI-NEXT: s_mov_b32 s10, -1 331; CI-NEXT: s_mov_b32 s2, s10 332; CI-NEXT: s_mov_b32 s3, s11 333; CI-NEXT: s_waitcnt lgkmcnt(0) 334; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 335; CI-NEXT: s_mov_b32 s8, s4 336; CI-NEXT: s_mov_b32 s9, s5 337; CI-NEXT: s_mov_b32 s4, s6 338; CI-NEXT: s_mov_b32 s5, s7 339; CI-NEXT: s_mov_b32 s6, s10 340; CI-NEXT: s_mov_b32 s7, s11 341; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 342; CI-NEXT: s_waitcnt vmcnt(1) 343; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 344; CI-NEXT: v_rcp_f32_e32 v2, v1 345; CI-NEXT: s_waitcnt vmcnt(0) 346; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 347; CI-NEXT: v_mul_f32_e32 v2, v0, v2 348; CI-NEXT: v_trunc_f32_e32 v2, v2 349; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 350; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 351; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 352; CI-NEXT: s_endpgm 353; 354; VI-LABEL: unsafe_frem_f16: 355; VI: ; %bb.0: 356; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 357; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 358; VI-NEXT: s_waitcnt lgkmcnt(0) 359; VI-NEXT: 
v_mov_b32_e32 v2, s6 360; VI-NEXT: s_add_u32 s0, s0, 8 361; VI-NEXT: v_mov_b32_e32 v3, s7 362; VI-NEXT: s_addc_u32 s1, s1, 0 363; VI-NEXT: flat_load_ushort v4, v[2:3] 364; VI-NEXT: v_mov_b32_e32 v3, s1 365; VI-NEXT: v_mov_b32_e32 v2, s0 366; VI-NEXT: flat_load_ushort v2, v[2:3] 367; VI-NEXT: v_mov_b32_e32 v0, s4 368; VI-NEXT: v_mov_b32_e32 v1, s5 369; VI-NEXT: s_waitcnt vmcnt(0) 370; VI-NEXT: v_rcp_f16_e32 v3, v2 371; VI-NEXT: v_mul_f16_e32 v3, v4, v3 372; VI-NEXT: v_trunc_f16_e32 v3, v3 373; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 374; VI-NEXT: flat_store_short v[0:1], v2 375; VI-NEXT: s_endpgm 376; 377; GFX9-LABEL: unsafe_frem_f16: 378; GFX9: ; %bb.0: 379; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 380; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 381; GFX9-NEXT: v_mov_b32_e32 v0, 0 382; GFX9-NEXT: s_waitcnt lgkmcnt(0) 383; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] 384; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 385; GFX9-NEXT: s_waitcnt vmcnt(0) 386; GFX9-NEXT: v_rcp_f16_e32 v3, v2 387; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 388; GFX9-NEXT: v_trunc_f16_e32 v3, v3 389; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 390; GFX9-NEXT: global_store_short v0, v1, s[4:5] 391; GFX9-NEXT: s_endpgm 392; 393; GFX10-LABEL: unsafe_frem_f16: 394; GFX10: ; %bb.0: 395; GFX10-NEXT: s_clause 0x1 396; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 397; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 398; GFX10-NEXT: v_mov_b32_e32 v0, 0 399; GFX10-NEXT: s_waitcnt lgkmcnt(0) 400; GFX10-NEXT: s_clause 0x1 401; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] 402; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 403; GFX10-NEXT: s_waitcnt vmcnt(0) 404; GFX10-NEXT: v_rcp_f16_e32 v3, v2 405; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 406; GFX10-NEXT: v_trunc_f16_e32 v3, v3 407; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 408; GFX10-NEXT: global_store_short v0, v1, s[4:5] 409; GFX10-NEXT: s_endpgm 410 half addrspace(1)* %in2) #1 { 411 %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 
412 %r0 = load half, half addrspace(1)* %in1, align 4 413 %r1 = load half, half addrspace(1)* %gep2, align 4 414 %r2 = frem afn half %r0, %r1 415 store half %r2, half addrspace(1)* %out, align 4 416 ret void 417} 418 419define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, 420; SI-LABEL: frem_f32: 421; SI: ; %bb.0: 422; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 423; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 424; SI-NEXT: s_mov_b32 s11, 0xf000 425; SI-NEXT: s_mov_b32 s10, -1 426; SI-NEXT: s_waitcnt lgkmcnt(0) 427; SI-NEXT: s_mov_b32 s8, s4 428; SI-NEXT: s_mov_b32 s9, s5 429; SI-NEXT: s_mov_b32 s4, s6 430; SI-NEXT: s_mov_b32 s5, s7 431; SI-NEXT: s_mov_b32 s6, s10 432; SI-NEXT: s_mov_b32 s7, s11 433; SI-NEXT: s_mov_b32 s2, s10 434; SI-NEXT: s_mov_b32 s3, s11 435; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 436; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 437; SI-NEXT: s_waitcnt vmcnt(0) 438; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 439; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 440; SI-NEXT: v_rcp_f32_e32 v4, v3 441; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 442; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 443; SI-NEXT: v_fma_f32 v4, v5, v4, v4 444; SI-NEXT: v_mul_f32_e32 v5, v2, v4 445; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 446; SI-NEXT: v_fma_f32 v5, v6, v4, v5 447; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 448; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 449; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 450; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 451; SI-NEXT: v_trunc_f32_e32 v2, v2 452; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 453; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 454; SI-NEXT: s_endpgm 455; 456; CI-LABEL: frem_f32: 457; CI: ; %bb.0: 458; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 459; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 460; CI-NEXT: s_mov_b32 s11, 0xf000 461; CI-NEXT: s_mov_b32 s10, -1 462; CI-NEXT: s_mov_b32 s2, s10 463; CI-NEXT: s_waitcnt lgkmcnt(0) 464; CI-NEXT: s_mov_b32 s8, 
s4 465; CI-NEXT: s_mov_b32 s9, s5 466; CI-NEXT: s_mov_b32 s4, s6 467; CI-NEXT: s_mov_b32 s5, s7 468; CI-NEXT: s_mov_b32 s6, s10 469; CI-NEXT: s_mov_b32 s7, s11 470; CI-NEXT: s_mov_b32 s3, s11 471; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 472; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 473; CI-NEXT: s_waitcnt vmcnt(0) 474; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 475; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 476; CI-NEXT: v_rcp_f32_e32 v4, v3 477; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 478; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 479; CI-NEXT: v_fma_f32 v4, v5, v4, v4 480; CI-NEXT: v_mul_f32_e32 v5, v2, v4 481; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 482; CI-NEXT: v_fma_f32 v5, v6, v4, v5 483; CI-NEXT: v_fma_f32 v2, -v3, v5, v2 484; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 485; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 486; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 487; CI-NEXT: v_trunc_f32_e32 v2, v2 488; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 489; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 490; CI-NEXT: s_endpgm 491; 492; VI-LABEL: frem_f32: 493; VI: ; %bb.0: 494; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 495; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 496; VI-NEXT: s_waitcnt lgkmcnt(0) 497; VI-NEXT: v_mov_b32_e32 v2, s6 498; VI-NEXT: s_add_u32 s0, s0, 16 499; VI-NEXT: v_mov_b32_e32 v3, s7 500; VI-NEXT: s_addc_u32 s1, s1, 0 501; VI-NEXT: flat_load_dword v4, v[2:3] 502; VI-NEXT: v_mov_b32_e32 v3, s1 503; VI-NEXT: v_mov_b32_e32 v2, s0 504; VI-NEXT: flat_load_dword v2, v[2:3] 505; VI-NEXT: v_mov_b32_e32 v0, s4 506; VI-NEXT: v_mov_b32_e32 v1, s5 507; VI-NEXT: s_waitcnt vmcnt(0) 508; VI-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v4 509; VI-NEXT: v_div_scale_f32 v3, vcc, v4, v2, v4 510; VI-NEXT: v_rcp_f32_e32 v6, v5 511; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 512; VI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 513; VI-NEXT: v_fma_f32 v6, v7, v6, v6 514; VI-NEXT: v_mul_f32_e32 v7, v3, v6 515; VI-NEXT: v_fma_f32 
v8, -v5, v7, v3 516; VI-NEXT: v_fma_f32 v7, v8, v6, v7 517; VI-NEXT: v_fma_f32 v3, -v5, v7, v3 518; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 519; VI-NEXT: v_div_fmas_f32 v3, v3, v6, v7 520; VI-NEXT: v_div_fixup_f32 v3, v3, v2, v4 521; VI-NEXT: v_trunc_f32_e32 v3, v3 522; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 523; VI-NEXT: flat_store_dword v[0:1], v2 524; VI-NEXT: s_endpgm 525; 526; GFX9-LABEL: frem_f32: 527; GFX9: ; %bb.0: 528; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 529; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 530; GFX9-NEXT: v_mov_b32_e32 v0, 0 531; GFX9-NEXT: s_waitcnt lgkmcnt(0) 532; GFX9-NEXT: global_load_dword v1, v0, s[6:7] 533; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 534; GFX9-NEXT: s_waitcnt vmcnt(0) 535; GFX9-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, v1 536; GFX9-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1 537; GFX9-NEXT: v_rcp_f32_e32 v5, v4 538; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 539; GFX9-NEXT: v_fma_f32 v6, -v4, v5, 1.0 540; GFX9-NEXT: v_fma_f32 v5, v6, v5, v5 541; GFX9-NEXT: v_mul_f32_e32 v6, v3, v5 542; GFX9-NEXT: v_fma_f32 v7, -v4, v6, v3 543; GFX9-NEXT: v_fma_f32 v6, v7, v5, v6 544; GFX9-NEXT: v_fma_f32 v3, -v4, v6, v3 545; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 546; GFX9-NEXT: v_div_fmas_f32 v3, v3, v5, v6 547; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v1 548; GFX9-NEXT: v_trunc_f32_e32 v3, v3 549; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 550; GFX9-NEXT: global_store_dword v0, v1, s[4:5] 551; GFX9-NEXT: s_endpgm 552; 553; GFX10-LABEL: frem_f32: 554; GFX10: ; %bb.0: 555; GFX10-NEXT: s_clause 0x1 556; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 557; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 558; GFX10-NEXT: v_mov_b32_e32 v0, 0 559; GFX10-NEXT: s_waitcnt lgkmcnt(0) 560; GFX10-NEXT: s_clause 0x1 561; GFX10-NEXT: global_load_dword v1, v0, s[6:7] 562; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 563; GFX10-NEXT: s_waitcnt vmcnt(0) 564; GFX10-NEXT: 
v_div_scale_f32 v4, s0, v2, v2, v1 565; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 566; GFX10-NEXT: v_rcp_f32_e32 v5, v4 567; GFX10-NEXT: s_denorm_mode 15 568; GFX10-NEXT: v_fma_f32 v6, -v4, v5, 1.0 569; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v5 570; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 571; GFX10-NEXT: v_fma_f32 v7, -v4, v6, v3 572; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v5 573; GFX10-NEXT: v_fma_f32 v3, -v4, v6, v3 574; GFX10-NEXT: s_denorm_mode 12 575; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v6 576; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v1 577; GFX10-NEXT: v_trunc_f32_e32 v3, v3 578; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 579; GFX10-NEXT: global_store_dword v0, v1, s[4:5] 580; GFX10-NEXT: s_endpgm 581 float addrspace(1)* %in2) #0 { 582 %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 583 %r0 = load float, float addrspace(1)* %in1, align 4 584 %r1 = load float, float addrspace(1)* %gep2, align 4 585 %r2 = frem float %r0, %r1 586 store float %r2, float addrspace(1)* %out, align 4 587 ret void 588} 589 590define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, 591; SI-LABEL: fast_frem_f32: 592; SI: ; %bb.0: 593; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 594; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 595; SI-NEXT: s_mov_b32 s11, 0xf000 596; SI-NEXT: s_mov_b32 s10, -1 597; SI-NEXT: s_waitcnt lgkmcnt(0) 598; SI-NEXT: s_mov_b32 s8, s4 599; SI-NEXT: s_mov_b32 s9, s5 600; SI-NEXT: s_mov_b32 s4, s6 601; SI-NEXT: s_mov_b32 s5, s7 602; SI-NEXT: s_mov_b32 s6, s10 603; SI-NEXT: s_mov_b32 s7, s11 604; SI-NEXT: s_mov_b32 s2, s10 605; SI-NEXT: s_mov_b32 s3, s11 606; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 607; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 608; SI-NEXT: s_waitcnt vmcnt(0) 609; SI-NEXT: v_rcp_f32_e32 v2, v1 610; SI-NEXT: v_mul_f32_e32 v2, v0, v2 611; SI-NEXT: v_trunc_f32_e32 v2, v2 612; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 613; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 614; SI-NEXT: s_endpgm 
615; 616; CI-LABEL: fast_frem_f32: 617; CI: ; %bb.0: 618; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 619; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 620; CI-NEXT: s_mov_b32 s11, 0xf000 621; CI-NEXT: s_mov_b32 s10, -1 622; CI-NEXT: s_mov_b32 s2, s10 623; CI-NEXT: s_waitcnt lgkmcnt(0) 624; CI-NEXT: s_mov_b32 s8, s4 625; CI-NEXT: s_mov_b32 s9, s5 626; CI-NEXT: s_mov_b32 s4, s6 627; CI-NEXT: s_mov_b32 s5, s7 628; CI-NEXT: s_mov_b32 s6, s10 629; CI-NEXT: s_mov_b32 s7, s11 630; CI-NEXT: s_mov_b32 s3, s11 631; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 632; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 633; CI-NEXT: s_waitcnt vmcnt(0) 634; CI-NEXT: v_rcp_f32_e32 v2, v1 635; CI-NEXT: v_mul_f32_e32 v2, v0, v2 636; CI-NEXT: v_trunc_f32_e32 v2, v2 637; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 638; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 639; CI-NEXT: s_endpgm 640; 641; VI-LABEL: fast_frem_f32: 642; VI: ; %bb.0: 643; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 644; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 645; VI-NEXT: s_waitcnt lgkmcnt(0) 646; VI-NEXT: v_mov_b32_e32 v2, s6 647; VI-NEXT: s_add_u32 s0, s0, 16 648; VI-NEXT: v_mov_b32_e32 v3, s7 649; VI-NEXT: s_addc_u32 s1, s1, 0 650; VI-NEXT: flat_load_dword v4, v[2:3] 651; VI-NEXT: v_mov_b32_e32 v3, s1 652; VI-NEXT: v_mov_b32_e32 v2, s0 653; VI-NEXT: flat_load_dword v2, v[2:3] 654; VI-NEXT: v_mov_b32_e32 v0, s4 655; VI-NEXT: v_mov_b32_e32 v1, s5 656; VI-NEXT: s_waitcnt vmcnt(0) 657; VI-NEXT: v_rcp_f32_e32 v3, v2 658; VI-NEXT: v_mul_f32_e32 v3, v4, v3 659; VI-NEXT: v_trunc_f32_e32 v3, v3 660; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 661; VI-NEXT: flat_store_dword v[0:1], v2 662; VI-NEXT: s_endpgm 663; 664; GFX9-LABEL: fast_frem_f32: 665; GFX9: ; %bb.0: 666; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 667; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 668; GFX9-NEXT: v_mov_b32_e32 v0, 0 669; GFX9-NEXT: s_waitcnt lgkmcnt(0) 670; GFX9-NEXT: global_load_dword v1, v0, s[6:7] 671; GFX9-NEXT: global_load_dword v2, 
v0, s[2:3] offset:16 672; GFX9-NEXT: s_waitcnt vmcnt(0) 673; GFX9-NEXT: v_rcp_f32_e32 v3, v2 674; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 675; GFX9-NEXT: v_trunc_f32_e32 v3, v3 676; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 677; GFX9-NEXT: global_store_dword v0, v1, s[4:5] 678; GFX9-NEXT: s_endpgm 679; 680; GFX10-LABEL: fast_frem_f32: 681; GFX10: ; %bb.0: 682; GFX10-NEXT: s_clause 0x1 683; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 684; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 685; GFX10-NEXT: v_mov_b32_e32 v0, 0 686; GFX10-NEXT: s_waitcnt lgkmcnt(0) 687; GFX10-NEXT: s_clause 0x1 688; GFX10-NEXT: global_load_dword v1, v0, s[6:7] 689; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 690; GFX10-NEXT: s_waitcnt vmcnt(0) 691; GFX10-NEXT: v_rcp_f32_e32 v3, v2 692; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 693; GFX10-NEXT: v_trunc_f32_e32 v3, v3 694; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 695; GFX10-NEXT: global_store_dword v0, v1, s[4:5] 696; GFX10-NEXT: s_endpgm 697 float addrspace(1)* %in2) #0 { 698 %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 699 %r0 = load float, float addrspace(1)* %in1, align 4 700 %r1 = load float, float addrspace(1)* %gep2, align 4 701 %r2 = frem fast float %r0, %r1 702 store float %r2, float addrspace(1)* %out, align 4 703 ret void 704} 705 706define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, 707; SI-LABEL: unsafe_frem_f32: 708; SI: ; %bb.0: 709; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 710; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 711; SI-NEXT: s_mov_b32 s11, 0xf000 712; SI-NEXT: s_mov_b32 s10, -1 713; SI-NEXT: s_waitcnt lgkmcnt(0) 714; SI-NEXT: s_mov_b32 s8, s4 715; SI-NEXT: s_mov_b32 s9, s5 716; SI-NEXT: s_mov_b32 s4, s6 717; SI-NEXT: s_mov_b32 s5, s7 718; SI-NEXT: s_mov_b32 s6, s10 719; SI-NEXT: s_mov_b32 s7, s11 720; SI-NEXT: s_mov_b32 s2, s10 721; SI-NEXT: s_mov_b32 s3, s11 722; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 723; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], 0 offset:16 724; SI-NEXT: s_waitcnt vmcnt(0) 725; SI-NEXT: v_rcp_f32_e32 v2, v1 726; SI-NEXT: v_mul_f32_e32 v2, v0, v2 727; SI-NEXT: v_trunc_f32_e32 v2, v2 728; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 729; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 730; SI-NEXT: s_endpgm 731; 732; CI-LABEL: unsafe_frem_f32: 733; CI: ; %bb.0: 734; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 735; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 736; CI-NEXT: s_mov_b32 s11, 0xf000 737; CI-NEXT: s_mov_b32 s10, -1 738; CI-NEXT: s_mov_b32 s2, s10 739; CI-NEXT: s_waitcnt lgkmcnt(0) 740; CI-NEXT: s_mov_b32 s8, s4 741; CI-NEXT: s_mov_b32 s9, s5 742; CI-NEXT: s_mov_b32 s4, s6 743; CI-NEXT: s_mov_b32 s5, s7 744; CI-NEXT: s_mov_b32 s6, s10 745; CI-NEXT: s_mov_b32 s7, s11 746; CI-NEXT: s_mov_b32 s3, s11 747; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 748; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 749; CI-NEXT: s_waitcnt vmcnt(0) 750; CI-NEXT: v_rcp_f32_e32 v2, v1 751; CI-NEXT: v_mul_f32_e32 v2, v0, v2 752; CI-NEXT: v_trunc_f32_e32 v2, v2 753; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 754; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 755; CI-NEXT: s_endpgm 756; 757; VI-LABEL: unsafe_frem_f32: 758; VI: ; %bb.0: 759; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 760; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 761; VI-NEXT: s_waitcnt lgkmcnt(0) 762; VI-NEXT: v_mov_b32_e32 v2, s6 763; VI-NEXT: s_add_u32 s0, s0, 16 764; VI-NEXT: v_mov_b32_e32 v3, s7 765; VI-NEXT: s_addc_u32 s1, s1, 0 766; VI-NEXT: flat_load_dword v4, v[2:3] 767; VI-NEXT: v_mov_b32_e32 v3, s1 768; VI-NEXT: v_mov_b32_e32 v2, s0 769; VI-NEXT: flat_load_dword v2, v[2:3] 770; VI-NEXT: v_mov_b32_e32 v0, s4 771; VI-NEXT: v_mov_b32_e32 v1, s5 772; VI-NEXT: s_waitcnt vmcnt(0) 773; VI-NEXT: v_rcp_f32_e32 v3, v2 774; VI-NEXT: v_mul_f32_e32 v3, v4, v3 775; VI-NEXT: v_trunc_f32_e32 v3, v3 776; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 777; VI-NEXT: flat_store_dword v[0:1], v2 778; VI-NEXT: s_endpgm 779; 780; GFX9-LABEL: unsafe_frem_f32: 
781; GFX9: ; %bb.0: 782; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 783; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 784; GFX9-NEXT: v_mov_b32_e32 v0, 0 785; GFX9-NEXT: s_waitcnt lgkmcnt(0) 786; GFX9-NEXT: global_load_dword v1, v0, s[6:7] 787; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 788; GFX9-NEXT: s_waitcnt vmcnt(0) 789; GFX9-NEXT: v_rcp_f32_e32 v3, v2 790; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 791; GFX9-NEXT: v_trunc_f32_e32 v3, v3 792; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 793; GFX9-NEXT: global_store_dword v0, v1, s[4:5] 794; GFX9-NEXT: s_endpgm 795; 796; GFX10-LABEL: unsafe_frem_f32: 797; GFX10: ; %bb.0: 798; GFX10-NEXT: s_clause 0x1 799; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 800; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 801; GFX10-NEXT: v_mov_b32_e32 v0, 0 802; GFX10-NEXT: s_waitcnt lgkmcnt(0) 803; GFX10-NEXT: s_clause 0x1 804; GFX10-NEXT: global_load_dword v1, v0, s[6:7] 805; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 806; GFX10-NEXT: s_waitcnt vmcnt(0) 807; GFX10-NEXT: v_rcp_f32_e32 v3, v2 808; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 809; GFX10-NEXT: v_trunc_f32_e32 v3, v3 810; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 811; GFX10-NEXT: global_store_dword v0, v1, s[4:5] 812; GFX10-NEXT: s_endpgm 813 float addrspace(1)* %in2) #1 { 814 %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 815 %r0 = load float, float addrspace(1)* %in1, align 4 816 %r1 = load float, float addrspace(1)* %gep2, align 4 817 %r2 = frem afn float %r0, %r1 818 store float %r2, float addrspace(1)* %out, align 4 819 ret void 820} 821 822define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, 823; SI-LABEL: frem_f64: 824; SI: ; %bb.0: 825; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 826; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 827; SI-NEXT: s_mov_b32 s7, 0xf000 828; SI-NEXT: s_mov_b32 s6, -1 829; SI-NEXT: s_waitcnt lgkmcnt(0) 830; SI-NEXT: s_mov_b32 s4, s8 831; SI-NEXT: s_mov_b32 s5, s9 832; SI-NEXT: 
s_mov_b32 s8, s10 833; SI-NEXT: s_mov_b32 s9, s11 834; SI-NEXT: s_mov_b32 s10, s6 835; SI-NEXT: s_mov_b32 s11, s7 836; SI-NEXT: s_mov_b32 s2, s6 837; SI-NEXT: s_mov_b32 s3, s7 838; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 839; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 840; SI-NEXT: s_waitcnt vmcnt(0) 841; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] 842; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 843; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 844; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 845; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 846; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 847; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1] 848; SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 849; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9] 850; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 851; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v9 852; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc 853; SI-NEXT: s_nop 1 854; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11] 855; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 856; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 857; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 858; SI-NEXT: s_mov_b32 s1, 0xfffff 859; SI-NEXT: s_mov_b32 s0, s6 860; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 861; SI-NEXT: v_not_b32_e32 v6, v6 862; SI-NEXT: v_and_b32_e32 v6, v4, v6 863; SI-NEXT: v_not_b32_e32 v7, v7 864; SI-NEXT: v_and_b32_e32 v7, v5, v7 865; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5 866; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8 867; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc 868; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8 869; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] 870; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 871; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] 872; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 873; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 874; SI-NEXT: s_endpgm 875; 876; CI-LABEL: frem_f64: 877; CI: ; %bb.0: 878; CI-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 879; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 880; CI-NEXT: s_mov_b32 s11, 0xf000 881; CI-NEXT: s_mov_b32 s10, -1 882; CI-NEXT: s_mov_b32 s2, s10 883; CI-NEXT: s_waitcnt lgkmcnt(0) 884; CI-NEXT: s_mov_b32 s8, s4 885; CI-NEXT: s_mov_b32 s9, s5 886; CI-NEXT: s_mov_b32 s4, s6 887; CI-NEXT: s_mov_b32 s5, s7 888; CI-NEXT: s_mov_b32 s6, s10 889; CI-NEXT: s_mov_b32 s7, s11 890; CI-NEXT: s_mov_b32 s3, s11 891; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 892; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 893; CI-NEXT: s_waitcnt vmcnt(0) 894; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] 895; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 896; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 897; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 898; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 899; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 900; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] 901; CI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 902; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 903; CI-NEXT: s_nop 1 904; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 905; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 906; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 907; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 908; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 909; CI-NEXT: s_endpgm 910; 911; VI-LABEL: frem_f64: 912; VI: ; %bb.0: 913; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 914; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 915; VI-NEXT: s_waitcnt lgkmcnt(0) 916; VI-NEXT: v_mov_b32_e32 v2, s6 917; VI-NEXT: v_mov_b32_e32 v3, s7 918; VI-NEXT: v_mov_b32_e32 v4, s0 919; VI-NEXT: v_mov_b32_e32 v5, s1 920; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 921; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 922; VI-NEXT: v_mov_b32_e32 v0, s4 923; VI-NEXT: v_mov_b32_e32 v1, s5 924; VI-NEXT: s_waitcnt vmcnt(0) 925; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], 
v[2:3] 926; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] 927; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 928; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 929; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 930; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 931; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3] 932; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] 933; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] 934; VI-NEXT: s_nop 1 935; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] 936; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3] 937; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 938; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 939; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 940; VI-NEXT: s_endpgm 941; 942; GFX9-LABEL: frem_f64: 943; GFX9: ; %bb.0: 944; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 945; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 946; GFX9-NEXT: v_mov_b32_e32 v12, 0 947; GFX9-NEXT: s_waitcnt lgkmcnt(0) 948; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] 949; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] 950; GFX9-NEXT: s_waitcnt vmcnt(0) 951; GFX9-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] 952; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 953; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 954; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 955; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 956; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 957; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] 958; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 959; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 960; GFX9-NEXT: s_nop 1 961; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 962; GFX9-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 963; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 964; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 965; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5] 966; GFX9-NEXT: 
s_endpgm 967; 968; GFX10-LABEL: frem_f64: 969; GFX10: ; %bb.0: 970; GFX10-NEXT: s_clause 0x1 971; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 972; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 973; GFX10-NEXT: v_mov_b32_e32 v12, 0 974; GFX10-NEXT: s_waitcnt lgkmcnt(0) 975; GFX10-NEXT: s_clause 0x1 976; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] 977; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] 978; GFX10-NEXT: s_waitcnt vmcnt(0) 979; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1] 980; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 981; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 982; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 983; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 984; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 985; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] 986; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 987; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 988; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 989; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 990; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 991; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 992; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5] 993; GFX10-NEXT: s_endpgm 994 double addrspace(1)* %in2) #0 { 995 %r0 = load double, double addrspace(1)* %in1, align 8 996 %r1 = load double, double addrspace(1)* %in2, align 8 997 %r2 = frem double %r0, %r1 998 store double %r2, double addrspace(1)* %out, align 8 999 ret void 1000} 1001; frem with the 'fast' flag on double. The checks expect a v_rcp_f64-based quotient (v_fma_f64 refinement steps and a v_mul_f64) with no v_div_scale / v_div_fmas / v_div_fixup, then a final v_fma_f64 that takes the negated truncated quotient. 1002define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, 1003; SI-LABEL: fast_frem_f64: 1004; SI: ; %bb.0: 1005; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 1006; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1007; SI-NEXT: s_mov_b32 s7, 0xf000 1008; SI-NEXT: s_mov_b32 s6, -1 1009; SI-NEXT: s_waitcnt lgkmcnt(0) 1010; SI-NEXT: s_mov_b32 s4, s8 1011; SI-NEXT: s_mov_b32 s5, s9 1012; SI-NEXT: 
s_mov_b32 s8, s10 1013; SI-NEXT: s_mov_b32 s9, s11 1014; SI-NEXT: s_mov_b32 s10, s6 1015; SI-NEXT: s_mov_b32 s11, s7 1016; SI-NEXT: s_mov_b32 s2, s6 1017; SI-NEXT: s_mov_b32 s3, s7 1018; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1019; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 1020; SI-NEXT: s_waitcnt vmcnt(0) 1021; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1022; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1023; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1024; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1025; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1026; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1027; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1028; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1029; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 1030; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 1031; SI-NEXT: s_mov_b32 s1, 0xfffff 1032; SI-NEXT: s_mov_b32 s0, s6 1033; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 1034; SI-NEXT: v_not_b32_e32 v6, v6 1035; SI-NEXT: v_and_b32_e32 v6, v4, v6 1036; SI-NEXT: v_not_b32_e32 v7, v7 1037; SI-NEXT: v_and_b32_e32 v7, v5, v7 1038; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5 1039; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8 1040; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc 1041; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8 1042; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] 1043; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 1044; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] 1045; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1046; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1047; SI-NEXT: s_endpgm 1048; 1049; CI-LABEL: fast_frem_f64: 1050; CI: ; %bb.0: 1051; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1052; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1053; CI-NEXT: s_mov_b32 s11, 0xf000 1054; CI-NEXT: s_mov_b32 s10, -1 1055; CI-NEXT: s_mov_b32 s2, s10 1056; CI-NEXT: s_waitcnt lgkmcnt(0) 1057; CI-NEXT: s_mov_b32 s8, s4 1058; CI-NEXT: s_mov_b32 s9, s5 1059; CI-NEXT: s_mov_b32 s4, s6 1060; 
CI-NEXT: s_mov_b32 s5, s7 1061; CI-NEXT: s_mov_b32 s6, s10 1062; CI-NEXT: s_mov_b32 s7, s11 1063; CI-NEXT: s_mov_b32 s3, s11 1064; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1065; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 1066; CI-NEXT: s_waitcnt vmcnt(0) 1067; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1068; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1069; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1070; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1071; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1072; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1073; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1074; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1075; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1076; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1077; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1078; CI-NEXT: s_endpgm 1079; 1080; VI-LABEL: fast_frem_f64: 1081; VI: ; %bb.0: 1082; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1083; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1084; VI-NEXT: s_waitcnt lgkmcnt(0) 1085; VI-NEXT: v_mov_b32_e32 v2, s6 1086; VI-NEXT: v_mov_b32_e32 v3, s7 1087; VI-NEXT: v_mov_b32_e32 v4, s0 1088; VI-NEXT: v_mov_b32_e32 v5, s1 1089; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1090; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 1091; VI-NEXT: v_mov_b32_e32 v0, s4 1092; VI-NEXT: v_mov_b32_e32 v1, s5 1093; VI-NEXT: s_waitcnt vmcnt(0) 1094; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1095; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1096; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1097; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1098; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1099; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] 1100; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] 1101; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] 1102; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 1103; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 1104; VI-NEXT: flat_store_dwordx2 
v[0:1], v[2:3] 1105; VI-NEXT: s_endpgm 1106; 1107; GFX9-LABEL: fast_frem_f64: 1108; GFX9: ; %bb.0: 1109; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1110; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1111; GFX9-NEXT: v_mov_b32_e32 v10, 0 1112; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1113; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] 1114; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] 1115; GFX9-NEXT: s_waitcnt vmcnt(0) 1116; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1117; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1118; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1119; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1120; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1121; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1122; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1123; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1124; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1125; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1126; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] 1127; GFX9-NEXT: s_endpgm 1128; 1129; GFX10-LABEL: fast_frem_f64: 1130; GFX10: ; %bb.0: 1131; GFX10-NEXT: s_clause 0x1 1132; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1133; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1134; GFX10-NEXT: v_mov_b32_e32 v10, 0 1135; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1136; GFX10-NEXT: s_clause 0x1 1137; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] 1138; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] 1139; GFX10-NEXT: s_waitcnt vmcnt(0) 1140; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1141; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1142; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1143; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1144; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1145; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1146; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1147; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1148; GFX10-NEXT: 
v_trunc_f64_e32 v[4:5], v[4:5] 1149; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1150; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] 1151; GFX10-NEXT: s_endpgm 1152 double addrspace(1)* %in2) #0 { 1153 %r0 = load double, double addrspace(1)* %in1, align 8 1154 %r1 = load double, double addrspace(1)* %in2, align 8 1155 %r2 = frem fast double %r0, %r1 1156 store double %r2, double addrspace(1)* %out, align 8 1157 ret void 1158} 1159; frem with only the 'afn' flag on double, using attribute group #1. The checks expect a v_rcp_f64-based quotient (v_fma_f64 refinement steps and a v_mul_f64) with no v_div_scale / v_div_fmas / v_div_fixup. 1160define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, 1161; SI-LABEL: unsafe_frem_f64: 1162; SI: ; %bb.0: 1163; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 1164; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1165; SI-NEXT: s_mov_b32 s7, 0xf000 1166; SI-NEXT: s_mov_b32 s6, -1 1167; SI-NEXT: s_waitcnt lgkmcnt(0) 1168; SI-NEXT: s_mov_b32 s4, s8 1169; SI-NEXT: s_mov_b32 s5, s9 1170; SI-NEXT: s_mov_b32 s8, s10 1171; SI-NEXT: s_mov_b32 s9, s11 1172; SI-NEXT: s_mov_b32 s10, s6 1173; SI-NEXT: s_mov_b32 s11, s7 1174; SI-NEXT: s_mov_b32 s2, s6 1175; SI-NEXT: s_mov_b32 s3, s7 1176; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1177; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 1178; SI-NEXT: s_waitcnt vmcnt(0) 1179; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1180; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1181; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1182; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1183; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1184; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1185; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1186; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1187; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 1188; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 1189; SI-NEXT: s_mov_b32 s1, 0xfffff 1190; SI-NEXT: s_mov_b32 s0, s6 1191; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 1192; SI-NEXT: v_not_b32_e32 v6, v6 1193; SI-NEXT: v_and_b32_e32 v6, v4, v6 1194; SI-NEXT: v_not_b32_e32 v7, v7 1195; SI-NEXT: v_and_b32_e32 v7, 
v5, v7 1196; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5 1197; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8 1198; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc 1199; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8 1200; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] 1201; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 1202; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] 1203; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1204; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1205; SI-NEXT: s_endpgm 1206; 1207; CI-LABEL: unsafe_frem_f64: 1208; CI: ; %bb.0: 1209; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1210; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1211; CI-NEXT: s_mov_b32 s11, 0xf000 1212; CI-NEXT: s_mov_b32 s10, -1 1213; CI-NEXT: s_mov_b32 s2, s10 1214; CI-NEXT: s_waitcnt lgkmcnt(0) 1215; CI-NEXT: s_mov_b32 s8, s4 1216; CI-NEXT: s_mov_b32 s9, s5 1217; CI-NEXT: s_mov_b32 s4, s6 1218; CI-NEXT: s_mov_b32 s5, s7 1219; CI-NEXT: s_mov_b32 s6, s10 1220; CI-NEXT: s_mov_b32 s7, s11 1221; CI-NEXT: s_mov_b32 s3, s11 1222; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1223; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 1224; CI-NEXT: s_waitcnt vmcnt(0) 1225; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1226; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1227; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1228; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1229; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1230; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1231; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1232; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1233; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1234; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1235; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1236; CI-NEXT: s_endpgm 1237; 1238; VI-LABEL: unsafe_frem_f64: 1239; VI: ; %bb.0: 1240; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1241; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1242; VI-NEXT: s_waitcnt lgkmcnt(0) 1243; VI-NEXT: v_mov_b32_e32 
v2, s6 1244; VI-NEXT: v_mov_b32_e32 v3, s7 1245; VI-NEXT: v_mov_b32_e32 v4, s0 1246; VI-NEXT: v_mov_b32_e32 v5, s1 1247; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1248; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 1249; VI-NEXT: v_mov_b32_e32 v0, s4 1250; VI-NEXT: v_mov_b32_e32 v1, s5 1251; VI-NEXT: s_waitcnt vmcnt(0) 1252; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1253; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1254; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1255; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1256; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1257; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] 1258; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] 1259; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] 1260; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 1261; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 1262; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1263; VI-NEXT: s_endpgm 1264; 1265; GFX9-LABEL: unsafe_frem_f64: 1266; GFX9: ; %bb.0: 1267; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1268; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1269; GFX9-NEXT: v_mov_b32_e32 v10, 0 1270; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1271; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] 1272; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] 1273; GFX9-NEXT: s_waitcnt vmcnt(0) 1274; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1275; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1276; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1277; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1278; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1279; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1280; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1281; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1282; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1283; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1284; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] 1285; GFX9-NEXT: s_endpgm 1286; 1287; GFX10-LABEL: unsafe_frem_f64: 1288; 
GFX10: ; %bb.0: 1289; GFX10-NEXT: s_clause 0x1 1290; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1291; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1292; GFX10-NEXT: v_mov_b32_e32 v10, 0 1293; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1294; GFX10-NEXT: s_clause 0x1 1295; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] 1296; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] 1297; GFX10-NEXT: s_waitcnt vmcnt(0) 1298; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1299; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1300; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1301; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1302; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1303; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1304; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1305; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1306; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1307; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1308; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] 1309; GFX10-NEXT: s_endpgm 1310 double addrspace(1)* %in2) #1 { 1311 %r0 = load double, double addrspace(1)* %in1, align 8 1312 %r1 = load double, double addrspace(1)* %in2, align 8 1313 %r2 = frem afn double %r0, %r1 1314 store double %r2, double addrspace(1)* %out, align 8 1315 ret void 1316} 1317; frem on <2 x half> with no fast-math flags; the second operand is loaded from %in2 advanced by 4 elements. The SI and CI checks convert each element to f32 and run the f32 division sequence; the VI, GFX9 and GFX10 checks use v_div_fixup_f16, v_trunc_f16 and v_fma_f16 per element. 1318define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, 1319; SI-LABEL: frem_v2f16: 1320; SI: ; %bb.0: 1321; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1322; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1323; SI-NEXT: s_mov_b32 s3, 0xf000 1324; SI-NEXT: s_mov_b32 s2, -1 1325; SI-NEXT: s_waitcnt lgkmcnt(0) 1326; SI-NEXT: s_mov_b32 s0, s4 1327; SI-NEXT: s_mov_b32 s1, s5 1328; SI-NEXT: s_mov_b32 s4, s6 1329; SI-NEXT: s_mov_b32 s5, s7 1330; SI-NEXT: s_mov_b32 s6, s2 1331; SI-NEXT: s_mov_b32 s7, s3 1332; SI-NEXT: s_mov_b32 s10, s2 1333; SI-NEXT: s_mov_b32 s11, s3 1334; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 
1335; SI-NEXT: s_waitcnt vmcnt(0) 1336; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 1337; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1338; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1339; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 1340; SI-NEXT: s_waitcnt vmcnt(0) 1341; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 1342; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1343; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1344; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 1345; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 1346; SI-NEXT: v_rcp_f32_e32 v6, v5 1347; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1348; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 1349; SI-NEXT: v_fma_f32 v6, v7, v6, v6 1350; SI-NEXT: v_mul_f32_e32 v7, v4, v6 1351; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 1352; SI-NEXT: v_fma_f32 v7, v8, v6, v7 1353; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 1354; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1355; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 1356; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 1357; SI-NEXT: v_trunc_f32_e32 v4, v4 1358; SI-NEXT: v_fma_f32 v0, -v4, v2, v0 1359; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1360; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1361; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1362; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 1363; SI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 1364; SI-NEXT: v_rcp_f32_e32 v5, v4 1365; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1366; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 1367; SI-NEXT: v_fma_f32 v5, v6, v5, v5 1368; SI-NEXT: v_mul_f32_e32 v6, v2, v5 1369; SI-NEXT: v_fma_f32 v7, -v4, v6, v2 1370; SI-NEXT: v_fma_f32 v6, v7, v5, v6 1371; SI-NEXT: v_fma_f32 v2, -v4, v6, v2 1372; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1373; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 1374; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 1375; SI-NEXT: v_trunc_f32_e32 v2, v2 1376; SI-NEXT: v_fma_f32 v1, -v2, v3, v1 1377; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1378; SI-NEXT: v_or_b32_e32 v0, v1, v0 1379; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 1380; SI-NEXT: s_endpgm 1381; 1382; CI-LABEL: frem_v2f16: 1383; CI: ; %bb.0: 1384; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1385; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1386; CI-NEXT: s_mov_b32 s3, 0xf000 1387; CI-NEXT: s_mov_b32 s2, -1 1388; CI-NEXT: s_mov_b32 s10, s2 1389; CI-NEXT: s_waitcnt lgkmcnt(0) 1390; CI-NEXT: s_mov_b32 s0, s4 1391; CI-NEXT: s_mov_b32 s1, s5 1392; CI-NEXT: s_mov_b32 s4, s6 1393; CI-NEXT: s_mov_b32 s5, s7 1394; CI-NEXT: s_mov_b32 s6, s2 1395; CI-NEXT: s_mov_b32 s7, s3 1396; CI-NEXT: s_mov_b32 s11, s3 1397; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 1398; CI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 1399; CI-NEXT: s_waitcnt vmcnt(1) 1400; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 1401; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1402; CI-NEXT: s_waitcnt vmcnt(0) 1403; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 1404; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1405; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1406; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 1407; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 1408; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 1409; CI-NEXT: v_rcp_f32_e32 v6, v5 1410; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1411; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 1412; CI-NEXT: v_fma_f32 v6, v7, v6, v6 1413; CI-NEXT: v_mul_f32_e32 v7, v4, v6 1414; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 1415; CI-NEXT: v_fma_f32 v7, v8, v6, v7 1416; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 1417; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1418; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 1419; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 1420; CI-NEXT: v_trunc_f32_e32 v4, v4 1421; CI-NEXT: v_fma_f32 v0, -v4, v2, v0 1422; CI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 1423; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 1424; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1425; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 1426; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1427; CI-NEXT: v_rcp_f32_e32 v5, v4 1428; CI-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 3 1429; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 1430; CI-NEXT: v_fma_f32 v5, v6, v5, v5 1431; CI-NEXT: v_mul_f32_e32 v6, v2, v5 1432; CI-NEXT: v_fma_f32 v7, -v4, v6, v2 1433; CI-NEXT: v_fma_f32 v6, v7, v5, v6 1434; CI-NEXT: v_fma_f32 v2, -v4, v6, v2 1435; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1436; CI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 1437; CI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 1438; CI-NEXT: v_trunc_f32_e32 v2, v2 1439; CI-NEXT: v_fma_f32 v1, -v2, v3, v1 1440; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 1441; CI-NEXT: v_or_b32_e32 v0, v1, v0 1442; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1443; CI-NEXT: s_endpgm 1444; 1445; VI-LABEL: frem_v2f16: 1446; VI: ; %bb.0: 1447; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1448; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1449; VI-NEXT: s_waitcnt lgkmcnt(0) 1450; VI-NEXT: v_mov_b32_e32 v2, s6 1451; VI-NEXT: s_add_u32 s0, s0, 16 1452; VI-NEXT: v_mov_b32_e32 v3, s7 1453; VI-NEXT: s_addc_u32 s1, s1, 0 1454; VI-NEXT: flat_load_dword v4, v[2:3] 1455; VI-NEXT: v_mov_b32_e32 v3, s1 1456; VI-NEXT: v_mov_b32_e32 v2, s0 1457; VI-NEXT: flat_load_dword v2, v[2:3] 1458; VI-NEXT: v_mov_b32_e32 v0, s4 1459; VI-NEXT: v_mov_b32_e32 v1, s5 1460; VI-NEXT: s_waitcnt vmcnt(1) 1461; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 1462; VI-NEXT: v_cvt_f32_f16_e32 v5, v3 1463; VI-NEXT: s_waitcnt vmcnt(0) 1464; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 1465; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 1466; VI-NEXT: v_rcp_f32_e32 v7, v7 1467; VI-NEXT: v_mul_f32_e32 v5, v5, v7 1468; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 1469; VI-NEXT: v_div_fixup_f16 v5, v5, v6, v3 1470; VI-NEXT: v_trunc_f16_e32 v5, v5 1471; VI-NEXT: v_fma_f16 v3, -v5, v6, v3 1472; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 1473; VI-NEXT: v_cvt_f32_f16_e32 v5, v4 1474; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1475; VI-NEXT: v_rcp_f32_e32 v6, v6 1476; VI-NEXT: v_mul_f32_e32 v5, v5, v6 1477; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 1478; VI-NEXT: v_div_fixup_f16 v5, v5, v2, v4 1479; 
VI-NEXT: v_trunc_f16_e32 v5, v5 1480; VI-NEXT: v_fma_f16 v2, -v5, v2, v4 1481; VI-NEXT: v_or_b32_e32 v2, v2, v3 1482; VI-NEXT: flat_store_dword v[0:1], v2 1483; VI-NEXT: s_endpgm 1484; 1485; GFX9-LABEL: frem_v2f16: 1486; GFX9: ; %bb.0: 1487; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1488; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1489; GFX9-NEXT: v_mov_b32_e32 v0, 0 1490; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1491; GFX9-NEXT: global_load_dword v1, v0, s[6:7] 1492; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 1493; GFX9-NEXT: s_waitcnt vmcnt(1) 1494; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 1495; GFX9-NEXT: s_waitcnt vmcnt(0) 1496; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 1497; GFX9-NEXT: v_rcp_f32_e32 v4, v4 1498; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 1499; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 1500; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 1501; GFX9-NEXT: v_trunc_f16_e32 v3, v3 1502; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v1 1503; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1504; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 1505; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1506; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1 1507; GFX9-NEXT: v_rcp_f32_e32 v5, v5 1508; GFX9-NEXT: v_mul_f32_e32 v4, v4, v5 1509; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4 1510; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1 1511; GFX9-NEXT: v_trunc_f16_e32 v4, v4 1512; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1 1513; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 1514; GFX9-NEXT: global_store_dword v0, v1, s[4:5] 1515; GFX9-NEXT: s_endpgm 1516; 1517; GFX10-LABEL: frem_v2f16: 1518; GFX10: ; %bb.0: 1519; GFX10-NEXT: s_clause 0x1 1520; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1521; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1522; GFX10-NEXT: v_mov_b32_e32 v0, 0 1523; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1524; GFX10-NEXT: s_clause 0x1 1525; GFX10-NEXT: global_load_dword v1, v0, s[6:7] 1526; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 1527; GFX10-NEXT: s_waitcnt vmcnt(1) 1528; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 1529; 
GFX10-NEXT: s_waitcnt vmcnt(0) 1530; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 1531; GFX10-NEXT: v_rcp_f32_e32 v4, v4 1532; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 1533; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 1534; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 1535; GFX10-NEXT: v_trunc_f16_e32 v3, v3 1536; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1 1537; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1538; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1539; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 1540; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 1541; GFX10-NEXT: v_rcp_f32_e32 v5, v5 1542; GFX10-NEXT: v_mul_f32_e32 v4, v4, v5 1543; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 1544; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1 1545; GFX10-NEXT: v_trunc_f16_e32 v4, v4 1546; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1 1547; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 1548; GFX10-NEXT: global_store_dword v0, v1, s[4:5] 1549; GFX10-NEXT: s_endpgm 1550 <2 x half> addrspace(1)* %in2) #0 { 1551 %gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4 1552 %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8 1553 %r1 = load <2 x half>, <2 x half> addrspace(1)* %gep2, align 8 1554 %r2 = frem <2 x half> %r0, %r1 1555 store <2 x half> %r2, <2 x half> addrspace(1)* %out, align 8 1556 ret void 1557} 1558 1559define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in1, 1560; SI-LABEL: frem_v4f16: 1561; SI: ; %bb.0: 1562; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1563; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1564; SI-NEXT: s_mov_b32 s3, 0xf000 1565; SI-NEXT: s_mov_b32 s2, -1 1566; SI-NEXT: s_waitcnt lgkmcnt(0) 1567; SI-NEXT: s_mov_b32 s0, s4 1568; SI-NEXT: s_mov_b32 s1, s5 1569; SI-NEXT: s_mov_b32 s4, s6 1570; SI-NEXT: s_mov_b32 s5, s7 1571; SI-NEXT: s_mov_b32 s6, s2 1572; SI-NEXT: s_mov_b32 s7, s3 1573; SI-NEXT: s_mov_b32 s10, s2 1574; SI-NEXT: s_mov_b32 s11, s3 1575; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1576; SI-NEXT: s_waitcnt vmcnt(0) 1577; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v0 1578; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1579; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 1580; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 1581; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 1582; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 1583; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32 1584; SI-NEXT: s_waitcnt vmcnt(0) 1585; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 1586; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1587; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1588; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 1589; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1590; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1591; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 1592; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 1593; SI-NEXT: v_rcp_f32_e32 v10, v9 1594; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1595; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 1596; SI-NEXT: v_fma_f32 v10, v11, v10, v10 1597; SI-NEXT: v_mul_f32_e32 v11, v8, v10 1598; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 1599; SI-NEXT: v_fma_f32 v11, v12, v10, v11 1600; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 1601; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1602; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 1603; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 1604; SI-NEXT: v_trunc_f32_e32 v8, v8 1605; SI-NEXT: v_fma_f32 v1, -v8, v1, v5 1606; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1607; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1608; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1609; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 1610; SI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 1611; SI-NEXT: v_rcp_f32_e32 v9, v8 1612; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1613; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 1614; SI-NEXT: v_fma_f32 v9, v10, v9, v9 1615; SI-NEXT: v_mul_f32_e32 v10, v5, v9 1616; SI-NEXT: v_fma_f32 v11, -v8, v10, v5 1617; SI-NEXT: v_fma_f32 v10, v11, v9, v10 1618; SI-NEXT: v_fma_f32 v5, -v8, v10, v5 1619; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1620; SI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 1621; 
SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 1622; SI-NEXT: v_trunc_f32_e32 v5, v5 1623; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 1624; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 1625; SI-NEXT: v_or_b32_e32 v1, v4, v1 1626; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 1627; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 1628; SI-NEXT: v_rcp_f32_e32 v7, v5 1629; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1630; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 1631; SI-NEXT: v_fma_f32 v7, v8, v7, v7 1632; SI-NEXT: v_mul_f32_e32 v8, v4, v7 1633; SI-NEXT: v_fma_f32 v9, -v5, v8, v4 1634; SI-NEXT: v_fma_f32 v8, v9, v7, v8 1635; SI-NEXT: v_fma_f32 v4, -v5, v8, v4 1636; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1637; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 1638; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 1639; SI-NEXT: v_trunc_f32_e32 v4, v4 1640; SI-NEXT: v_fma_f32 v0, -v4, v0, v3 1641; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1642; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1643; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 1644; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 1645; SI-NEXT: v_rcp_f32_e32 v5, v4 1646; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1647; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 1648; SI-NEXT: v_fma_f32 v5, v7, v5, v5 1649; SI-NEXT: v_mul_f32_e32 v7, v3, v5 1650; SI-NEXT: v_fma_f32 v8, -v4, v7, v3 1651; SI-NEXT: v_fma_f32 v7, v8, v5, v7 1652; SI-NEXT: v_fma_f32 v3, -v4, v7, v3 1653; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1654; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 1655; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 1656; SI-NEXT: v_trunc_f32_e32 v3, v3 1657; SI-NEXT: v_fma_f32 v2, -v3, v6, v2 1658; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1659; SI-NEXT: v_or_b32_e32 v0, v2, v0 1660; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1661; SI-NEXT: s_endpgm 1662; 1663; CI-LABEL: frem_v4f16: 1664; CI: ; %bb.0: 1665; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1666; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1667; CI-NEXT: s_mov_b32 s3, 0xf000 1668; 
CI-NEXT: s_mov_b32 s2, -1 1669; CI-NEXT: s_mov_b32 s10, s2 1670; CI-NEXT: s_waitcnt lgkmcnt(0) 1671; CI-NEXT: s_mov_b32 s0, s4 1672; CI-NEXT: s_mov_b32 s1, s5 1673; CI-NEXT: s_mov_b32 s4, s6 1674; CI-NEXT: s_mov_b32 s5, s7 1675; CI-NEXT: s_mov_b32 s6, s2 1676; CI-NEXT: s_mov_b32 s7, s3 1677; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1678; CI-NEXT: s_mov_b32 s11, s3 1679; CI-NEXT: s_waitcnt vmcnt(0) 1680; CI-NEXT: v_cvt_f32_f16_e32 v2, v0 1681; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1682; CI-NEXT: v_cvt_f32_f16_e32 v3, v0 1683; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 1684; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 1685; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 1686; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32 1687; CI-NEXT: s_waitcnt vmcnt(0) 1688; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 1689; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1690; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 1691; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 1692; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1693; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1694; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 1695; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 1696; CI-NEXT: v_rcp_f32_e32 v10, v9 1697; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1698; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 1699; CI-NEXT: v_fma_f32 v10, v11, v10, v10 1700; CI-NEXT: v_mul_f32_e32 v11, v8, v10 1701; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 1702; CI-NEXT: v_fma_f32 v11, v12, v10, v11 1703; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 1704; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1705; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 1706; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 1707; CI-NEXT: v_trunc_f32_e32 v8, v8 1708; CI-NEXT: v_fma_f32 v1, -v8, v1, v5 1709; CI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 1710; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 1711; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1712; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 1713; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1714; CI-NEXT: v_rcp_f32_e32 v9, v8 
1715; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1716; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 1717; CI-NEXT: v_fma_f32 v9, v10, v9, v9 1718; CI-NEXT: v_mul_f32_e32 v10, v5, v9 1719; CI-NEXT: v_fma_f32 v11, -v8, v10, v5 1720; CI-NEXT: v_fma_f32 v10, v11, v9, v10 1721; CI-NEXT: v_fma_f32 v5, -v8, v10, v5 1722; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1723; CI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 1724; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 1725; CI-NEXT: v_trunc_f32_e32 v5, v5 1726; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 1727; CI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 1728; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 1729; CI-NEXT: v_or_b32_e32 v1, v4, v1 1730; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 1731; CI-NEXT: v_rcp_f32_e32 v7, v5 1732; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1733; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 1734; CI-NEXT: v_fma_f32 v7, v8, v7, v7 1735; CI-NEXT: v_mul_f32_e32 v8, v4, v7 1736; CI-NEXT: v_fma_f32 v9, -v5, v8, v4 1737; CI-NEXT: v_fma_f32 v8, v9, v7, v8 1738; CI-NEXT: v_fma_f32 v4, -v5, v8, v4 1739; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1740; CI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 1741; CI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 1742; CI-NEXT: v_trunc_f32_e32 v4, v4 1743; CI-NEXT: v_fma_f32 v0, -v4, v0, v3 1744; CI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 1745; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 1746; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 1747; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1748; CI-NEXT: v_rcp_f32_e32 v5, v4 1749; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1750; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 1751; CI-NEXT: v_fma_f32 v5, v7, v5, v5 1752; CI-NEXT: v_mul_f32_e32 v7, v3, v5 1753; CI-NEXT: v_fma_f32 v8, -v4, v7, v3 1754; CI-NEXT: v_fma_f32 v7, v8, v5, v7 1755; CI-NEXT: v_fma_f32 v3, -v4, v7, v3 1756; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1757; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 1758; CI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 1759; 
CI-NEXT: v_trunc_f32_e32 v3, v3 1760; CI-NEXT: v_fma_f32 v2, -v3, v6, v2 1761; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 1762; CI-NEXT: v_or_b32_e32 v0, v2, v0 1763; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1764; CI-NEXT: s_endpgm 1765; 1766; VI-LABEL: frem_v4f16: 1767; VI: ; %bb.0: 1768; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1769; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1770; VI-NEXT: s_waitcnt lgkmcnt(0) 1771; VI-NEXT: v_mov_b32_e32 v2, s6 1772; VI-NEXT: s_add_u32 s0, s0, 32 1773; VI-NEXT: s_addc_u32 s1, s1, 0 1774; VI-NEXT: v_mov_b32_e32 v5, s1 1775; VI-NEXT: v_mov_b32_e32 v4, s0 1776; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 1777; VI-NEXT: v_mov_b32_e32 v3, s7 1778; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1779; VI-NEXT: v_mov_b32_e32 v0, s4 1780; VI-NEXT: v_mov_b32_e32 v1, s5 1781; VI-NEXT: s_waitcnt vmcnt(1) 1782; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 1783; VI-NEXT: v_cvt_f32_f16_e32 v9, v8 1784; VI-NEXT: s_waitcnt vmcnt(0) 1785; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 1786; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 1787; VI-NEXT: v_rcp_f32_e32 v9, v9 1788; VI-NEXT: v_mul_f32_e32 v7, v7, v9 1789; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 1790; VI-NEXT: v_div_fixup_f16 v7, v7, v8, v6 1791; VI-NEXT: v_trunc_f16_e32 v7, v7 1792; VI-NEXT: v_fma_f16 v6, -v7, v8, v6 1793; VI-NEXT: v_cvt_f32_f16_e32 v8, v5 1794; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 1795; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 1796; VI-NEXT: v_rcp_f32_e32 v8, v8 1797; VI-NEXT: v_mul_f32_e32 v7, v7, v8 1798; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 1799; VI-NEXT: v_div_fixup_f16 v7, v7, v5, v3 1800; VI-NEXT: v_trunc_f16_e32 v7, v7 1801; VI-NEXT: v_fma_f16 v3, -v7, v5, v3 1802; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 1803; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 1804; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 1805; VI-NEXT: v_or_b32_e32 v3, v3, v6 1806; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 1807; VI-NEXT: v_rcp_f32_e32 v8, v8 1808; VI-NEXT: v_mul_f32_e32 v6, v6, v8 1809; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 1810; VI-NEXT: 
v_div_fixup_f16 v6, v6, v7, v5 1811; VI-NEXT: v_trunc_f16_e32 v6, v6 1812; VI-NEXT: v_fma_f16 v5, -v6, v7, v5 1813; VI-NEXT: v_cvt_f32_f16_e32 v7, v4 1814; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 1815; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 1816; VI-NEXT: v_rcp_f32_e32 v7, v7 1817; VI-NEXT: v_mul_f32_e32 v6, v6, v7 1818; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 1819; VI-NEXT: v_div_fixup_f16 v6, v6, v4, v2 1820; VI-NEXT: v_trunc_f16_e32 v6, v6 1821; VI-NEXT: v_fma_f16 v2, -v6, v4, v2 1822; VI-NEXT: v_or_b32_e32 v2, v2, v5 1823; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1824; VI-NEXT: s_endpgm 1825; 1826; GFX9-LABEL: frem_v4f16: 1827; GFX9: ; %bb.0: 1828; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1829; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1830; GFX9-NEXT: v_mov_b32_e32 v4, 0 1831; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1832; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] 1833; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 1834; GFX9-NEXT: s_waitcnt vmcnt(1) 1835; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v1 1836; GFX9-NEXT: s_waitcnt vmcnt(0) 1837; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3 1838; GFX9-NEXT: v_rcp_f32_e32 v6, v6 1839; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 1840; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 1841; GFX9-NEXT: v_div_fixup_f16 v5, v5, v3, v1 1842; GFX9-NEXT: v_trunc_f16_e32 v5, v5 1843; GFX9-NEXT: v_fma_f16 v5, -v5, v3, v1 1844; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1845; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v3 1846; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1847; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v1 1848; GFX9-NEXT: v_rcp_f32_e32 v7, v7 1849; GFX9-NEXT: v_mul_f32_e32 v6, v6, v7 1850; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6 1851; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1 1852; GFX9-NEXT: v_trunc_f16_e32 v6, v6 1853; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1 1854; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1 1855; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 1856; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 1857; GFX9-NEXT: v_rcp_f32_e32 v5, v5 1858; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5 1859; 
GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 1860; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0 1861; GFX9-NEXT: v_trunc_f16_e32 v3, v3 1862; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0 1863; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1864; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v2 1865; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1866; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v0 1867; GFX9-NEXT: v_rcp_f32_e32 v6, v6 1868; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 1869; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 1870; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0 1871; GFX9-NEXT: v_trunc_f16_e32 v5, v5 1872; GFX9-NEXT: v_fma_f16 v0, -v5, v2, v0 1873; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0 1874; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 1875; GFX9-NEXT: s_endpgm 1876; 1877; GFX10-LABEL: frem_v4f16: 1878; GFX10: ; %bb.0: 1879; GFX10-NEXT: s_clause 0x1 1880; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1881; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1882; GFX10-NEXT: v_mov_b32_e32 v4, 0 1883; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1884; GFX10-NEXT: s_clause 0x1 1885; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] 1886; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 1887; GFX10-NEXT: s_waitcnt vmcnt(1) 1888; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 1889; GFX10-NEXT: s_waitcnt vmcnt(0) 1890; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3 1891; GFX10-NEXT: v_rcp_f32_e32 v6, v6 1892; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 1893; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 1894; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1 1895; GFX10-NEXT: v_trunc_f16_e32 v5, v5 1896; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1 1897; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1898; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1899; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3 1900; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1 1901; GFX10-NEXT: v_rcp_f32_e32 v7, v7 1902; GFX10-NEXT: v_mul_f32_e32 v6, v6, v7 1903; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v6 1904; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1 1905; GFX10-NEXT: v_trunc_f16_e32 v6, v6 1906; GFX10-NEXT: v_fma_f16 v1, 
-v6, v3, v1 1907; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 1908; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1 1909; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 1910; GFX10-NEXT: v_rcp_f32_e32 v5, v5 1911; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 1912; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 1913; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0 1914; GFX10-NEXT: v_trunc_f16_e32 v3, v3 1915; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0 1916; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1917; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1918; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2 1919; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 1920; GFX10-NEXT: v_rcp_f32_e32 v6, v6 1921; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 1922; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 1923; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0 1924; GFX10-NEXT: v_trunc_f16_e32 v5, v5 1925; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0 1926; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0 1927; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 1928; GFX10-NEXT: s_endpgm 1929 <4 x half> addrspace(1)* %in2) #0 { 1930 %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4 1931 %r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16 1932 %r1 = load <4 x half>, <4 x half> addrspace(1)* %gep2, align 16 1933 %r2 = frem <4 x half> %r0, %r1 1934 store <4 x half> %r2, <4 x half> addrspace(1)* %out, align 16 1935 ret void 1936} 1937 ; Kernel frem_v2f32 loads one <2 x float> from %in1 and one from %in2 advanced by 4 elements (getelementptr i32 4, which the assertions below show as a 32-byte offset), computes their frem and stores the result to %out. The per-target assertion lines that follow are autogenerated by utils/update_llc_test_checks.py (see the note at the top of this file); regenerate them with that script instead of editing them by hand. 1938define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, 1939; SI-LABEL: frem_v2f32: 1940; SI: ; %bb.0: 1941; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1942; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1943; SI-NEXT: s_mov_b32 s3, 0xf000 1944; SI-NEXT: s_mov_b32 s2, -1 1945; SI-NEXT: s_waitcnt lgkmcnt(0) 1946; SI-NEXT: s_mov_b32 s0, s4 1947; SI-NEXT: s_mov_b32 s1, s5 1948; SI-NEXT: s_mov_b32 s4, s6 1949; SI-NEXT: s_mov_b32 s5, s7 1950; SI-NEXT: s_mov_b32 s6, s2 1951; SI-NEXT: s_mov_b32 s7, s3 1952; SI-NEXT: s_mov_b32 s10, s2 1953; SI-NEXT: s_mov_b32 s11, s3 1954; SI-NEXT: 
buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1955; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 1956; SI-NEXT: s_waitcnt vmcnt(0) 1957; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 1958; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 1959; SI-NEXT: v_rcp_f32_e32 v6, v5 1960; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1961; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 1962; SI-NEXT: v_fma_f32 v6, v7, v6, v6 1963; SI-NEXT: v_mul_f32_e32 v7, v4, v6 1964; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 1965; SI-NEXT: v_fma_f32 v7, v8, v6, v7 1966; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 1967; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1968; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 1969; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 1970; SI-NEXT: v_trunc_f32_e32 v4, v4 1971; SI-NEXT: v_fma_f32 v1, -v4, v3, v1 1972; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 1973; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 1974; SI-NEXT: v_rcp_f32_e32 v5, v4 1975; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1976; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 1977; SI-NEXT: v_fma_f32 v5, v6, v5, v5 1978; SI-NEXT: v_mul_f32_e32 v6, v3, v5 1979; SI-NEXT: v_fma_f32 v7, -v4, v6, v3 1980; SI-NEXT: v_fma_f32 v6, v7, v5, v6 1981; SI-NEXT: v_fma_f32 v3, -v4, v6, v3 1982; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1983; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 1984; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 1985; SI-NEXT: v_trunc_f32_e32 v3, v3 1986; SI-NEXT: v_fma_f32 v0, -v3, v2, v0 1987; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1988; SI-NEXT: s_endpgm 1989; 1990; CI-LABEL: frem_v2f32: 1991; CI: ; %bb.0: 1992; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1993; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1994; CI-NEXT: s_mov_b32 s3, 0xf000 1995; CI-NEXT: s_mov_b32 s2, -1 1996; CI-NEXT: s_mov_b32 s10, s2 1997; CI-NEXT: s_waitcnt lgkmcnt(0) 1998; CI-NEXT: s_mov_b32 s0, s4 1999; CI-NEXT: s_mov_b32 s1, s5 2000; CI-NEXT: s_mov_b32 s4, s6 2001; CI-NEXT: s_mov_b32 
s5, s7 2002; CI-NEXT: s_mov_b32 s6, s2 2003; CI-NEXT: s_mov_b32 s7, s3 2004; CI-NEXT: s_mov_b32 s11, s3 2005; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 2006; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 2007; CI-NEXT: s_waitcnt vmcnt(0) 2008; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 2009; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 2010; CI-NEXT: v_rcp_f32_e32 v6, v5 2011; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2012; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 2013; CI-NEXT: v_fma_f32 v6, v7, v6, v6 2014; CI-NEXT: v_mul_f32_e32 v7, v4, v6 2015; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 2016; CI-NEXT: v_fma_f32 v7, v8, v6, v7 2017; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 2018; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2019; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 2020; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 2021; CI-NEXT: v_trunc_f32_e32 v4, v4 2022; CI-NEXT: v_fma_f32 v1, -v4, v3, v1 2023; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 2024; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 2025; CI-NEXT: v_rcp_f32_e32 v5, v4 2026; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2027; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 2028; CI-NEXT: v_fma_f32 v5, v6, v5, v5 2029; CI-NEXT: v_mul_f32_e32 v6, v3, v5 2030; CI-NEXT: v_fma_f32 v7, -v4, v6, v3 2031; CI-NEXT: v_fma_f32 v6, v7, v5, v6 2032; CI-NEXT: v_fma_f32 v3, -v4, v6, v3 2033; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2034; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 2035; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 2036; CI-NEXT: v_trunc_f32_e32 v3, v3 2037; CI-NEXT: v_fma_f32 v0, -v3, v2, v0 2038; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2039; CI-NEXT: s_endpgm 2040; 2041; VI-LABEL: frem_v2f32: 2042; VI: ; %bb.0: 2043; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2044; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2045; VI-NEXT: s_waitcnt lgkmcnt(0) 2046; VI-NEXT: v_mov_b32_e32 v2, s6 2047; VI-NEXT: s_add_u32 s0, s0, 32 2048; VI-NEXT: s_addc_u32 s1, 
s1, 0 2049; VI-NEXT: v_mov_b32_e32 v5, s1 2050; VI-NEXT: v_mov_b32_e32 v3, s7 2051; VI-NEXT: v_mov_b32_e32 v4, s0 2052; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 2053; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 2054; VI-NEXT: v_mov_b32_e32 v0, s4 2055; VI-NEXT: v_mov_b32_e32 v1, s5 2056; VI-NEXT: s_waitcnt vmcnt(0) 2057; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3 2058; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3 2059; VI-NEXT: v_rcp_f32_e32 v8, v7 2060; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2061; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 2062; VI-NEXT: v_fma_f32 v8, v9, v8, v8 2063; VI-NEXT: v_mul_f32_e32 v9, v6, v8 2064; VI-NEXT: v_fma_f32 v10, -v7, v9, v6 2065; VI-NEXT: v_fma_f32 v9, v10, v8, v9 2066; VI-NEXT: v_fma_f32 v6, -v7, v9, v6 2067; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2068; VI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 2069; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3 2070; VI-NEXT: v_trunc_f32_e32 v6, v6 2071; VI-NEXT: v_fma_f32 v3, -v6, v5, v3 2072; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2 2073; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2 2074; VI-NEXT: v_rcp_f32_e32 v7, v6 2075; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2076; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2077; VI-NEXT: v_fma_f32 v7, v8, v7, v7 2078; VI-NEXT: v_mul_f32_e32 v8, v5, v7 2079; VI-NEXT: v_fma_f32 v9, -v6, v8, v5 2080; VI-NEXT: v_fma_f32 v8, v9, v7, v8 2081; VI-NEXT: v_fma_f32 v5, -v6, v8, v5 2082; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2083; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2084; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2 2085; VI-NEXT: v_trunc_f32_e32 v5, v5 2086; VI-NEXT: v_fma_f32 v2, -v5, v4, v2 2087; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 2088; VI-NEXT: s_endpgm 2089; 2090; GFX9-LABEL: frem_v2f32: 2091; GFX9: ; %bb.0: 2092; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2093; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2094; GFX9-NEXT: v_mov_b32_e32 v4, 0 2095; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2096; 
GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] 2097; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 2098; GFX9-NEXT: s_waitcnt vmcnt(0) 2099; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1 2100; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 2101; GFX9-NEXT: v_rcp_f32_e32 v7, v6 2102; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2103; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2104; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7 2105; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7 2106; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5 2107; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8 2108; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5 2109; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2110; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2111; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1 2112; GFX9-NEXT: v_trunc_f32_e32 v5, v5 2113; GFX9-NEXT: v_fma_f32 v1, -v5, v3, v1 2114; GFX9-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v0 2115; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 2116; GFX9-NEXT: v_rcp_f32_e32 v6, v5 2117; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2118; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0 2119; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6 2120; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6 2121; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3 2122; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7 2123; GFX9-NEXT: v_fma_f32 v3, -v5, v7, v3 2124; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2125; GFX9-NEXT: v_div_fmas_f32 v3, v3, v6, v7 2126; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0 2127; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2128; GFX9-NEXT: v_fma_f32 v0, -v3, v2, v0 2129; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 2130; GFX9-NEXT: s_endpgm 2131; 2132; GFX10-LABEL: frem_v2f32: 2133; GFX10: ; %bb.0: 2134; GFX10-NEXT: s_clause 0x1 2135; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2136; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2137; GFX10-NEXT: v_mov_b32_e32 v4, 0 2138; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2139; GFX10-NEXT: s_clause 0x1 2140; GFX10-NEXT: global_load_dwordx2 
v[0:1], v4, s[6:7] 2141; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 2142; GFX10-NEXT: s_waitcnt vmcnt(0) 2143; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1 2144; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 2145; GFX10-NEXT: v_rcp_f32_e32 v7, v6 2146; GFX10-NEXT: s_denorm_mode 15 2147; GFX10-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2148; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v7 2149; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7 2150; GFX10-NEXT: v_fma_f32 v9, -v6, v8, v5 2151; GFX10-NEXT: v_fmac_f32_e32 v8, v9, v7 2152; GFX10-NEXT: v_fma_f32 v5, -v6, v8, v5 2153; GFX10-NEXT: s_denorm_mode 12 2154; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2155; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1 2156; GFX10-NEXT: v_trunc_f32_e32 v5, v5 2157; GFX10-NEXT: v_fma_f32 v1, -v5, v3, v1 2158; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0 2159; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 2160; GFX10-NEXT: v_rcp_f32_e32 v6, v5 2161; GFX10-NEXT: s_denorm_mode 15 2162; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0 2163; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v6 2164; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6 2165; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v3 2166; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v6 2167; GFX10-NEXT: v_fma_f32 v3, -v5, v7, v3 2168; GFX10-NEXT: s_denorm_mode 12 2169; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7 2170; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0 2171; GFX10-NEXT: v_trunc_f32_e32 v3, v3 2172; GFX10-NEXT: v_fma_f32 v0, -v3, v2, v0 2173; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 2174; GFX10-NEXT: s_endpgm 2175 <2 x float> addrspace(1)* %in2) #0 { 2176 %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 2177 %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8 2178 %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8 2179 %r2 = frem <2 x float> %r0, %r1 2180 store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8 2181 ret void 2182} 2183 2184define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* 
%out, <4 x float> addrspace(1)* %in1, 2185; SI-LABEL: frem_v4f32: 2186; SI: ; %bb.0: 2187; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2188; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 2189; SI-NEXT: s_mov_b32 s3, 0xf000 2190; SI-NEXT: s_mov_b32 s2, -1 2191; SI-NEXT: s_waitcnt lgkmcnt(0) 2192; SI-NEXT: s_mov_b32 s0, s4 2193; SI-NEXT: s_mov_b32 s1, s5 2194; SI-NEXT: s_mov_b32 s4, s6 2195; SI-NEXT: s_mov_b32 s5, s7 2196; SI-NEXT: s_mov_b32 s6, s2 2197; SI-NEXT: s_mov_b32 s7, s3 2198; SI-NEXT: s_mov_b32 s10, s2 2199; SI-NEXT: s_mov_b32 s11, s3 2200; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 2201; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 2202; SI-NEXT: s_waitcnt vmcnt(0) 2203; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 2204; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 2205; SI-NEXT: v_rcp_f32_e32 v10, v9 2206; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2207; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 2208; SI-NEXT: v_fma_f32 v10, v11, v10, v10 2209; SI-NEXT: v_mul_f32_e32 v11, v8, v10 2210; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 2211; SI-NEXT: v_fma_f32 v11, v12, v10, v11 2212; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 2213; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2214; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 2215; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 2216; SI-NEXT: v_trunc_f32_e32 v8, v8 2217; SI-NEXT: v_fma_f32 v3, -v8, v7, v3 2218; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 2219; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 2220; SI-NEXT: v_rcp_f32_e32 v9, v8 2221; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2222; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 2223; SI-NEXT: v_fma_f32 v9, v10, v9, v9 2224; SI-NEXT: v_mul_f32_e32 v10, v7, v9 2225; SI-NEXT: v_fma_f32 v11, -v8, v10, v7 2226; SI-NEXT: v_fma_f32 v10, v11, v9, v10 2227; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 2228; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2229; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 2230; SI-NEXT: v_div_fixup_f32 
v7, v7, v6, v2 2231; SI-NEXT: v_trunc_f32_e32 v7, v7 2232; SI-NEXT: v_fma_f32 v2, -v7, v6, v2 2233; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 2234; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 2235; SI-NEXT: v_rcp_f32_e32 v8, v7 2236; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2237; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 2238; SI-NEXT: v_fma_f32 v8, v9, v8, v8 2239; SI-NEXT: v_mul_f32_e32 v9, v6, v8 2240; SI-NEXT: v_fma_f32 v10, -v7, v9, v6 2241; SI-NEXT: v_fma_f32 v9, v10, v8, v9 2242; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 2243; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2244; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 2245; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2246; SI-NEXT: v_trunc_f32_e32 v6, v6 2247; SI-NEXT: v_fma_f32 v1, -v6, v5, v1 2248; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 2249; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 2250; SI-NEXT: v_rcp_f32_e32 v7, v6 2251; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2252; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2253; SI-NEXT: v_fma_f32 v7, v8, v7, v7 2254; SI-NEXT: v_mul_f32_e32 v8, v5, v7 2255; SI-NEXT: v_fma_f32 v9, -v6, v8, v5 2256; SI-NEXT: v_fma_f32 v8, v9, v7, v8 2257; SI-NEXT: v_fma_f32 v5, -v6, v8, v5 2258; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2259; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2260; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2261; SI-NEXT: v_trunc_f32_e32 v5, v5 2262; SI-NEXT: v_fma_f32 v0, -v5, v4, v0 2263; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2264; SI-NEXT: s_endpgm 2265; 2266; CI-LABEL: frem_v4f32: 2267; CI: ; %bb.0: 2268; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2269; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 2270; CI-NEXT: s_mov_b32 s3, 0xf000 2271; CI-NEXT: s_mov_b32 s2, -1 2272; CI-NEXT: s_mov_b32 s10, s2 2273; CI-NEXT: s_waitcnt lgkmcnt(0) 2274; CI-NEXT: s_mov_b32 s0, s4 2275; CI-NEXT: s_mov_b32 s1, s5 2276; CI-NEXT: s_mov_b32 s4, s6 2277; CI-NEXT: s_mov_b32 s5, s7 2278; CI-NEXT: s_mov_b32 s6, s2 2279; 
CI-NEXT: s_mov_b32 s7, s3 2280; CI-NEXT: s_mov_b32 s11, s3 2281; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 2282; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 2283; CI-NEXT: s_waitcnt vmcnt(0) 2284; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 2285; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 2286; CI-NEXT: v_rcp_f32_e32 v10, v9 2287; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2288; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 2289; CI-NEXT: v_fma_f32 v10, v11, v10, v10 2290; CI-NEXT: v_mul_f32_e32 v11, v8, v10 2291; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 2292; CI-NEXT: v_fma_f32 v11, v12, v10, v11 2293; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 2294; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2295; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 2296; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 2297; CI-NEXT: v_trunc_f32_e32 v8, v8 2298; CI-NEXT: v_fma_f32 v3, -v8, v7, v3 2299; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 2300; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 2301; CI-NEXT: v_rcp_f32_e32 v9, v8 2302; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2303; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 2304; CI-NEXT: v_fma_f32 v9, v10, v9, v9 2305; CI-NEXT: v_mul_f32_e32 v10, v7, v9 2306; CI-NEXT: v_fma_f32 v11, -v8, v10, v7 2307; CI-NEXT: v_fma_f32 v10, v11, v9, v10 2308; CI-NEXT: v_fma_f32 v7, -v8, v10, v7 2309; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2310; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 2311; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 2312; CI-NEXT: v_trunc_f32_e32 v7, v7 2313; CI-NEXT: v_fma_f32 v2, -v7, v6, v2 2314; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 2315; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 2316; CI-NEXT: v_rcp_f32_e32 v8, v7 2317; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2318; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 2319; CI-NEXT: v_fma_f32 v8, v9, v8, v8 2320; CI-NEXT: v_mul_f32_e32 v9, v6, v8 2321; CI-NEXT: v_fma_f32 v10, -v7, v9, v6 2322; CI-NEXT: v_fma_f32 
v9, v10, v8, v9 2323; CI-NEXT: v_fma_f32 v6, -v7, v9, v6 2324; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2325; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 2326; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2327; CI-NEXT: v_trunc_f32_e32 v6, v6 2328; CI-NEXT: v_fma_f32 v1, -v6, v5, v1 2329; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 2330; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 2331; CI-NEXT: v_rcp_f32_e32 v7, v6 2332; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2333; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 2334; CI-NEXT: v_fma_f32 v7, v8, v7, v7 2335; CI-NEXT: v_mul_f32_e32 v8, v5, v7 2336; CI-NEXT: v_fma_f32 v9, -v6, v8, v5 2337; CI-NEXT: v_fma_f32 v8, v9, v7, v8 2338; CI-NEXT: v_fma_f32 v5, -v6, v8, v5 2339; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2340; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 2341; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2342; CI-NEXT: v_trunc_f32_e32 v5, v5 2343; CI-NEXT: v_fma_f32 v0, -v5, v4, v0 2344; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2345; CI-NEXT: s_endpgm 2346; 2347; VI-LABEL: frem_v4f32: 2348; VI: ; %bb.0: 2349; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2350; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 2351; VI-NEXT: s_waitcnt lgkmcnt(0) 2352; VI-NEXT: v_mov_b32_e32 v0, s6 2353; VI-NEXT: s_add_u32 s0, s0, 64 2354; VI-NEXT: s_addc_u32 s1, s1, 0 2355; VI-NEXT: v_mov_b32_e32 v5, s1 2356; VI-NEXT: v_mov_b32_e32 v1, s7 2357; VI-NEXT: v_mov_b32_e32 v4, s0 2358; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2359; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 2360; VI-NEXT: v_mov_b32_e32 v8, s4 2361; VI-NEXT: v_mov_b32_e32 v9, s5 2362; VI-NEXT: s_waitcnt vmcnt(0) 2363; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3 2364; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 2365; VI-NEXT: v_rcp_f32_e32 v12, v11 2366; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2367; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 2368; VI-NEXT: v_fma_f32 v12, v13, v12, v12 2369; VI-NEXT: v_mul_f32_e32 v13, v10, v12 
2370; VI-NEXT: v_fma_f32 v14, -v11, v13, v10 2371; VI-NEXT: v_fma_f32 v13, v14, v12, v13 2372; VI-NEXT: v_fma_f32 v10, -v11, v13, v10 2373; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2374; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 2375; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3 2376; VI-NEXT: v_trunc_f32_e32 v10, v10 2377; VI-NEXT: v_fma_f32 v3, -v10, v7, v3 2378; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2 2379; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 2380; VI-NEXT: v_rcp_f32_e32 v11, v10 2381; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2382; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0 2383; VI-NEXT: v_fma_f32 v11, v12, v11, v11 2384; VI-NEXT: v_mul_f32_e32 v12, v7, v11 2385; VI-NEXT: v_fma_f32 v13, -v10, v12, v7 2386; VI-NEXT: v_fma_f32 v12, v13, v11, v12 2387; VI-NEXT: v_fma_f32 v7, -v10, v12, v7 2388; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2389; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12 2390; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 2391; VI-NEXT: v_trunc_f32_e32 v7, v7 2392; VI-NEXT: v_fma_f32 v2, -v7, v6, v2 2393; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 2394; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 2395; VI-NEXT: v_rcp_f32_e32 v10, v7 2396; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2397; VI-NEXT: v_fma_f32 v11, -v7, v10, 1.0 2398; VI-NEXT: v_fma_f32 v10, v11, v10, v10 2399; VI-NEXT: v_mul_f32_e32 v11, v6, v10 2400; VI-NEXT: v_fma_f32 v12, -v7, v11, v6 2401; VI-NEXT: v_fma_f32 v11, v12, v10, v11 2402; VI-NEXT: v_fma_f32 v6, -v7, v11, v6 2403; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2404; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 2405; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2406; VI-NEXT: v_trunc_f32_e32 v6, v6 2407; VI-NEXT: v_fma_f32 v1, -v6, v5, v1 2408; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 2409; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 2410; VI-NEXT: v_rcp_f32_e32 v7, v6 2411; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2412; VI-NEXT: 
v_fma_f32 v10, -v6, v7, 1.0 2413; VI-NEXT: v_fma_f32 v7, v10, v7, v7 2414; VI-NEXT: v_mul_f32_e32 v10, v5, v7 2415; VI-NEXT: v_fma_f32 v11, -v6, v10, v5 2416; VI-NEXT: v_fma_f32 v10, v11, v7, v10 2417; VI-NEXT: v_fma_f32 v5, -v6, v10, v5 2418; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2419; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v10 2420; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2421; VI-NEXT: v_trunc_f32_e32 v5, v5 2422; VI-NEXT: v_fma_f32 v0, -v5, v4, v0 2423; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 2424; VI-NEXT: s_endpgm 2425; 2426; GFX9-LABEL: frem_v4f32: 2427; GFX9: ; %bb.0: 2428; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2429; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2430; GFX9-NEXT: v_mov_b32_e32 v8, 0 2431; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2432; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] 2433; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 2434; GFX9-NEXT: s_waitcnt vmcnt(0) 2435; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3 2436; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 2437; GFX9-NEXT: v_rcp_f32_e32 v11, v10 2438; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2439; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0 2440; GFX9-NEXT: v_fma_f32 v11, v12, v11, v11 2441; GFX9-NEXT: v_mul_f32_e32 v12, v9, v11 2442; GFX9-NEXT: v_fma_f32 v13, -v10, v12, v9 2443; GFX9-NEXT: v_fma_f32 v12, v13, v11, v12 2444; GFX9-NEXT: v_fma_f32 v9, -v10, v12, v9 2445; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2446; GFX9-NEXT: v_div_fmas_f32 v9, v9, v11, v12 2447; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3 2448; GFX9-NEXT: v_trunc_f32_e32 v9, v9 2449; GFX9-NEXT: v_fma_f32 v3, -v9, v7, v3 2450; GFX9-NEXT: v_div_scale_f32 v9, s[0:1], v6, v6, v2 2451; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 2452; GFX9-NEXT: v_rcp_f32_e32 v10, v9 2453; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2454; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0 2455; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10 2456; GFX9-NEXT: 
v_mul_f32_e32 v11, v7, v10 2457; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v7 2458; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11 2459; GFX9-NEXT: v_fma_f32 v7, -v9, v11, v7 2460; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2461; GFX9-NEXT: v_div_fmas_f32 v7, v7, v10, v11 2462; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2 2463; GFX9-NEXT: v_trunc_f32_e32 v7, v7 2464; GFX9-NEXT: v_fma_f32 v2, -v7, v6, v2 2465; GFX9-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 2466; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 2467; GFX9-NEXT: v_rcp_f32_e32 v9, v7 2468; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2469; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0 2470; GFX9-NEXT: v_fma_f32 v9, v10, v9, v9 2471; GFX9-NEXT: v_mul_f32_e32 v10, v6, v9 2472; GFX9-NEXT: v_fma_f32 v11, -v7, v10, v6 2473; GFX9-NEXT: v_fma_f32 v10, v11, v9, v10 2474; GFX9-NEXT: v_fma_f32 v6, -v7, v10, v6 2475; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2476; GFX9-NEXT: v_div_fmas_f32 v6, v6, v9, v10 2477; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2478; GFX9-NEXT: v_trunc_f32_e32 v6, v6 2479; GFX9-NEXT: v_fma_f32 v1, -v6, v5, v1 2480; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 2481; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 2482; GFX9-NEXT: v_rcp_f32_e32 v7, v6 2483; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2484; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0 2485; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7 2486; GFX9-NEXT: v_mul_f32_e32 v9, v5, v7 2487; GFX9-NEXT: v_fma_f32 v10, -v6, v9, v5 2488; GFX9-NEXT: v_fma_f32 v9, v10, v7, v9 2489; GFX9-NEXT: v_fma_f32 v5, -v6, v9, v5 2490; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2491; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v9 2492; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2493; GFX9-NEXT: v_trunc_f32_e32 v5, v5 2494; GFX9-NEXT: v_fma_f32 v0, -v5, v4, v0 2495; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] 2496; GFX9-NEXT: s_endpgm 2497; 2498; GFX10-LABEL: frem_v4f32: 2499; GFX10: ; %bb.0: 2500; GFX10-NEXT: 
s_clause 0x1 2501; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2502; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2503; GFX10-NEXT: v_mov_b32_e32 v8, 0 2504; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2505; GFX10-NEXT: s_clause 0x1 2506; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] 2507; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 2508; GFX10-NEXT: s_waitcnt vmcnt(0) 2509; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3 2510; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 2511; GFX10-NEXT: v_rcp_f32_e32 v11, v10 2512; GFX10-NEXT: s_denorm_mode 15 2513; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0 2514; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v11 2515; GFX10-NEXT: v_mul_f32_e32 v12, v9, v11 2516; GFX10-NEXT: v_fma_f32 v13, -v10, v12, v9 2517; GFX10-NEXT: v_fmac_f32_e32 v12, v13, v11 2518; GFX10-NEXT: v_fma_f32 v9, -v10, v12, v9 2519; GFX10-NEXT: s_denorm_mode 12 2520; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12 2521; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3 2522; GFX10-NEXT: v_trunc_f32_e32 v9, v9 2523; GFX10-NEXT: v_fma_f32 v3, -v9, v7, v3 2524; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2 2525; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 2526; GFX10-NEXT: v_rcp_f32_e32 v10, v9 2527; GFX10-NEXT: s_denorm_mode 15 2528; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0 2529; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v10 2530; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10 2531; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7 2532; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v10 2533; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7 2534; GFX10-NEXT: s_denorm_mode 12 2535; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11 2536; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2 2537; GFX10-NEXT: v_trunc_f32_e32 v7, v7 2538; GFX10-NEXT: v_fma_f32 v2, -v7, v6, v2 2539; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1 2540; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 2541; GFX10-NEXT: v_rcp_f32_e32 v9, v7 2542; GFX10-NEXT: s_denorm_mode 15 2543; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0 2544; 
GFX10-NEXT: v_fmac_f32_e32 v9, v10, v9 2545; GFX10-NEXT: v_mul_f32_e32 v10, v6, v9 2546; GFX10-NEXT: v_fma_f32 v11, -v7, v10, v6 2547; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v9 2548; GFX10-NEXT: v_fma_f32 v6, -v7, v10, v6 2549; GFX10-NEXT: s_denorm_mode 12 2550; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10 2551; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1 2552; GFX10-NEXT: v_trunc_f32_e32 v6, v6 2553; GFX10-NEXT: v_fma_f32 v1, -v6, v5, v1 2554; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0 2555; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 2556; GFX10-NEXT: v_rcp_f32_e32 v7, v6 2557; GFX10-NEXT: s_denorm_mode 15 2558; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0 2559; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v7 2560; GFX10-NEXT: v_mul_f32_e32 v9, v5, v7 2561; GFX10-NEXT: v_fma_f32 v10, -v6, v9, v5 2562; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v7 2563; GFX10-NEXT: v_fma_f32 v5, -v6, v9, v5 2564; GFX10-NEXT: s_denorm_mode 12 2565; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9 2566; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0 2567; GFX10-NEXT: v_trunc_f32_e32 v5, v5 2568; GFX10-NEXT: v_fma_f32 v0, -v5, v4, v0 2569; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] 2570; GFX10-NEXT: s_endpgm 2571 <4 x float> addrspace(1)* %in2) #0 { 2572 %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 2573 %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16 2574 %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16 2575 %r2 = frem <4 x float> %r0, %r1 2576 store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16 2577 ret void 2578} 2579; frem_v2f64 loads one <2 x double> from %in1 and one from %in2 advanced by 4 vectors (the 64-byte offset visible in the checked loads), takes their frem and stores it to %out. For every check prefix the expected code handles the two lanes one after the other, each as a div_scale/rcp/fma/div_fmas/div_fixup division, a truncation step and a final fma with the negated truncated quotient. The truncation is v_trunc_f64 for CI and newer, while the SI lines show a bfe/lshr/not/and/cndmask sequence in that position, presumably a manual expansion of the same operation. The check lines are autogenerated (see the NOTE at the top of the file) and should be regenerated rather than edited by hand. 2580define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, 2581; SI-LABEL: frem_v2f64: 2582; SI: ; %bb.0: 2583; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 2584; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2585; SI-NEXT: s_mov_b32 s7, 0xf000 2586; SI-NEXT: s_mov_b32 s6, -1 2587; SI-NEXT: s_waitcnt lgkmcnt(0) 2588; SI-NEXT: s_mov_b32 s4, s8 2589; SI-NEXT: s_mov_b32 
s5, s9 2590; SI-NEXT: s_mov_b32 s8, s10 2591; SI-NEXT: s_mov_b32 s9, s11 2592; SI-NEXT: s_mov_b32 s10, s6 2593; SI-NEXT: s_mov_b32 s11, s7 2594; SI-NEXT: s_mov_b32 s2, s6 2595; SI-NEXT: s_mov_b32 s3, s7 2596; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 2597; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64 2598; SI-NEXT: s_waitcnt vmcnt(0) 2599; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] 2600; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] 2601; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 2602; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 2603; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 2604; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 2605; SI-NEXT: v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3] 2606; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] 2607; SI-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13] 2608; SI-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9 2609; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v13 2610; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc 2611; SI-NEXT: s_nop 1 2612; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15] 2613; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] 2614; SI-NEXT: v_bfe_u32 v10, v9, 20, 11 2615; SI-NEXT: v_add_i32_e32 v12, vcc, 0xfffffc01, v10 2616; SI-NEXT: s_mov_b32 s3, 0xfffff 2617; SI-NEXT: v_lshr_b64 v[10:11], s[2:3], v12 2618; SI-NEXT: v_not_b32_e32 v10, v10 2619; SI-NEXT: v_and_b32_e32 v10, v8, v10 2620; SI-NEXT: v_not_b32_e32 v11, v11 2621; SI-NEXT: v_and_b32_e32 v11, v9, v11 2622; SI-NEXT: v_and_b32_e32 v13, 0x80000000, v9 2623; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v12 2624; SI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc 2625; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v12 2626; SI-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[0:1] 2627; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc 2628; SI-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[0:1] 2629; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] 2630; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], 
v[4:5], v[4:5], v[0:1] 2631; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] 2632; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 2633; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 2634; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 2635; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 2636; SI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1] 2637; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] 2638; SI-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11] 2639; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 2640; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v11 2641; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc 2642; SI-NEXT: s_nop 1 2643; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13] 2644; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] 2645; SI-NEXT: v_bfe_u32 v8, v7, 20, 11 2646; SI-NEXT: v_add_i32_e32 v10, vcc, 0xfffffc01, v8 2647; SI-NEXT: v_lshr_b64 v[8:9], s[2:3], v10 2648; SI-NEXT: v_not_b32_e32 v8, v8 2649; SI-NEXT: v_and_b32_e32 v8, v6, v8 2650; SI-NEXT: v_not_b32_e32 v9, v9 2651; SI-NEXT: v_and_b32_e32 v9, v7, v9 2652; SI-NEXT: v_and_b32_e32 v11, 0x80000000, v7 2653; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v10 2654; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc 2655; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v10 2656; SI-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[0:1] 2657; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc 2658; SI-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] 2659; SI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] 2660; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2661; SI-NEXT: s_endpgm 2662; 2663; CI-LABEL: frem_v2f64: 2664; CI: ; %bb.0: 2665; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2666; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 2667; CI-NEXT: s_mov_b32 s3, 0xf000 2668; CI-NEXT: s_mov_b32 s2, -1 2669; CI-NEXT: s_mov_b32 s10, s2 2670; CI-NEXT: s_waitcnt lgkmcnt(0) 2671; CI-NEXT: s_mov_b32 s0, s4 2672; CI-NEXT: s_mov_b32 s1, s5 2673; CI-NEXT: s_mov_b32 s4, s6 2674; CI-NEXT: s_mov_b32 s5, s7 2675; CI-NEXT: s_mov_b32 s6, s2 2676; 
CI-NEXT: s_mov_b32 s7, s3 2677; CI-NEXT: s_mov_b32 s11, s3 2678; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 2679; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 2680; CI-NEXT: s_waitcnt vmcnt(0) 2681; CI-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3] 2682; CI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] 2683; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 2684; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 2685; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 2686; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 2687; CI-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] 2688; CI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] 2689; CI-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] 2690; CI-NEXT: s_nop 1 2691; CI-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] 2692; CI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] 2693; CI-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] 2694; CI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] 2695; CI-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1] 2696; CI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] 2697; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 2698; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 2699; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 2700; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 2701; CI-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1] 2702; CI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] 2703; CI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] 2704; CI-NEXT: s_nop 1 2705; CI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] 2706; CI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] 2707; CI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 2708; CI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] 2709; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2710; CI-NEXT: s_endpgm 2711; 2712; VI-LABEL: frem_v2f64: 2713; VI: ; %bb.0: 2714; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2715; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x34 2716; VI-NEXT: s_waitcnt lgkmcnt(0) 2717; VI-NEXT: v_mov_b32_e32 v0, s6 2718; VI-NEXT: s_add_u32 s0, s0, 64 2719; VI-NEXT: s_addc_u32 s1, s1, 0 2720; VI-NEXT: v_mov_b32_e32 v5, s1 2721; VI-NEXT: v_mov_b32_e32 v1, s7 2722; VI-NEXT: v_mov_b32_e32 v4, s0 2723; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2724; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 2725; VI-NEXT: v_mov_b32_e32 v8, s4 2726; VI-NEXT: v_mov_b32_e32 v9, s5 2727; VI-NEXT: s_waitcnt vmcnt(0) 2728; VI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3] 2729; VI-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] 2730; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 2731; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] 2732; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 2733; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] 2734; VI-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] 2735; VI-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] 2736; VI-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] 2737; VI-NEXT: s_nop 1 2738; VI-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] 2739; VI-NEXT: v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3] 2740; VI-NEXT: v_trunc_f64_e32 v[10:11], v[10:11] 2741; VI-NEXT: v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3] 2742; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] 2743; VI-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] 2744; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0 2745; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 2746; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0 2747; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 2748; VI-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] 2749; VI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] 2750; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13] 2751; VI-NEXT: s_nop 1 2752; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15] 2753; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], 
v[0:1] 2754; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 2755; VI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] 2756; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 2757; VI-NEXT: s_endpgm 2758; 2759; GFX9-LABEL: frem_v2f64: 2760; GFX9: ; %bb.0: 2761; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2762; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2763; GFX9-NEXT: v_mov_b32_e32 v16, 0 2764; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2765; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] 2766; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 2767; GFX9-NEXT: s_waitcnt vmcnt(0) 2768; GFX9-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] 2769; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] 2770; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 2771; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 2772; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 2773; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 2774; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] 2775; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] 2776; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] 2777; GFX9-NEXT: s_nop 1 2778; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] 2779; GFX9-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] 2780; GFX9-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] 2781; GFX9-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] 2782; GFX9-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] 2783; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] 2784; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 2785; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 2786; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 2787; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 2788; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1] 2789; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] 2790; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] 2791; GFX9-NEXT: s_nop 1 2792; GFX9-NEXT: 
v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] 2793; GFX9-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] 2794; GFX9-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 2795; GFX9-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] 2796; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] 2797; GFX9-NEXT: s_endpgm 2798; 2799; GFX10-LABEL: frem_v2f64: 2800; GFX10: ; %bb.0: 2801; GFX10-NEXT: s_clause 0x1 2802; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2803; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2804; GFX10-NEXT: v_mov_b32_e32 v16, 0 2805; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2806; GFX10-NEXT: s_clause 0x1 2807; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] 2808; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 2809; GFX10-NEXT: s_waitcnt vmcnt(0) 2810; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3] 2811; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] 2812; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 2813; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 2814; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 2815; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] 2816; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3] 2817; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] 2818; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] 2819; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] 2820; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] 2821; GFX10-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] 2822; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] 2823; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1] 2824; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] 2825; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 2826; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 2827; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 2828; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 2829; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, 
v[0:1], v[4:5], v[0:1] 2830; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] 2831; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] 2832; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] 2833; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] 2834; GFX10-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 2835; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] 2836; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] 2837; GFX10-NEXT: s_endpgm 2838 <2 x double> addrspace(1)* %in2) #0 { 2839 %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4 2840 %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16 2841 %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16 2842 %r2 = frem <2 x double> %r0, %r1 2843 store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16 2844 ret void 2845} 2846; Attribute group #0 is the one attached to the two kernels above (nounwind, "unsafe-fp-math"="false", preserve-sign f32 denormal mode). Group #1 differs only in "unsafe-fp-math"="true". 2847attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 2848attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 2849