1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s 3; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s 4; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s 5 6define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, 7; SI-LABEL: frem_f16: 8; SI: ; %bb.0: 9; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 10; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 11; SI-NEXT: s_mov_b32 s11, 0xf000 12; SI-NEXT: s_mov_b32 s10, -1 13; SI-NEXT: s_waitcnt lgkmcnt(0) 14; SI-NEXT: s_mov_b32 s8, s4 15; SI-NEXT: s_mov_b32 s9, s5 16; SI-NEXT: s_mov_b32 s4, s6 17; SI-NEXT: s_mov_b32 s5, s7 18; SI-NEXT: s_mov_b32 s6, s10 19; SI-NEXT: s_mov_b32 s7, s11 20; SI-NEXT: s_mov_b32 s2, s10 21; SI-NEXT: s_mov_b32 s3, s11 22; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 23; SI-NEXT: s_waitcnt vmcnt(0) 24; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 25; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 26; SI-NEXT: s_waitcnt vmcnt(0) 27; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 28; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 29; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 30; SI-NEXT: v_rcp_f32_e32 v4, v3 31; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 32; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 33; SI-NEXT: v_fma_f32 v4, v5, v4, v4 34; SI-NEXT: v_mul_f32_e32 v5, v2, v4 35; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 36; SI-NEXT: v_fma_f32 v5, v6, v4, v5 37; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 38; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 39; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 40; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 41; SI-NEXT: v_trunc_f32_e32 v2, v2 42; SI-NEXT: 
v_fma_f32 v0, -v2, v1, v0 43; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 44; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 45; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 46; SI-NEXT: s_endpgm 47; 48; CI-LABEL: frem_f16: 49; CI: ; %bb.0: 50; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 51; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 52; CI-NEXT: s_mov_b32 s11, 0xf000 53; CI-NEXT: s_mov_b32 s10, -1 54; CI-NEXT: s_mov_b32 s2, s10 55; CI-NEXT: s_waitcnt lgkmcnt(0) 56; CI-NEXT: s_mov_b32 s8, s4 57; CI-NEXT: s_mov_b32 s9, s5 58; CI-NEXT: s_mov_b32 s4, s6 59; CI-NEXT: s_mov_b32 s5, s7 60; CI-NEXT: s_mov_b32 s3, s11 61; CI-NEXT: s_mov_b32 s6, s10 62; CI-NEXT: s_mov_b32 s7, s11 63; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 64; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 65; CI-NEXT: s_waitcnt vmcnt(1) 66; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 67; CI-NEXT: s_waitcnt vmcnt(0) 68; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 69; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 70; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 71; CI-NEXT: v_rcp_f32_e32 v4, v3 72; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 73; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 74; CI-NEXT: v_fma_f32 v4, v5, v4, v4 75; CI-NEXT: v_mul_f32_e32 v5, v2, v4 76; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 77; CI-NEXT: v_fma_f32 v5, v6, v4, v5 78; CI-NEXT: v_fma_f32 v2, -v3, v5, v2 79; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 80; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 81; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 82; CI-NEXT: v_trunc_f32_e32 v2, v2 83; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 84; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 85; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 86; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 87; CI-NEXT: s_endpgm 88; 89; VI-LABEL: frem_f16: 90; VI: ; %bb.0: 91; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 92; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 93; VI-NEXT: s_waitcnt lgkmcnt(0) 94; VI-NEXT: v_mov_b32_e32 v2, s6 95; VI-NEXT: s_add_u32 s0, s0, 8 
96; VI-NEXT: v_mov_b32_e32 v3, s7 97; VI-NEXT: s_addc_u32 s1, s1, 0 98; VI-NEXT: flat_load_ushort v4, v[2:3] 99; VI-NEXT: v_mov_b32_e32 v3, s1 100; VI-NEXT: v_mov_b32_e32 v2, s0 101; VI-NEXT: flat_load_ushort v2, v[2:3] 102; VI-NEXT: v_mov_b32_e32 v0, s4 103; VI-NEXT: v_mov_b32_e32 v1, s5 104; VI-NEXT: s_waitcnt vmcnt(1) 105; VI-NEXT: v_cvt_f32_f16_e32 v3, v4 106; VI-NEXT: s_waitcnt vmcnt(0) 107; VI-NEXT: v_cvt_f32_f16_e32 v5, v2 108; VI-NEXT: v_rcp_f32_e32 v5, v5 109; VI-NEXT: v_mul_f32_e32 v3, v3, v5 110; VI-NEXT: v_cvt_f16_f32_e32 v3, v3 111; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4 112; VI-NEXT: v_trunc_f16_e32 v3, v3 113; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 114; VI-NEXT: flat_store_short v[0:1], v2 115; VI-NEXT: s_endpgm 116 half addrspace(1)* %in2) #0 { 117 %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 118 %r0 = load half, half addrspace(1)* %in1, align 4 119 %r1 = load half, half addrspace(1)* %gep2, align 4 120 %r2 = frem half %r0, %r1 121 store half %r2, half addrspace(1)* %out, align 4 122 ret void 123} 124 125define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, 126; SI-LABEL: fast_frem_f16: 127; SI: ; %bb.0: 128; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 129; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 130; SI-NEXT: s_mov_b32 s11, 0xf000 131; SI-NEXT: s_mov_b32 s10, -1 132; SI-NEXT: s_waitcnt lgkmcnt(0) 133; SI-NEXT: s_mov_b32 s8, s4 134; SI-NEXT: s_mov_b32 s9, s5 135; SI-NEXT: s_mov_b32 s4, s6 136; SI-NEXT: s_mov_b32 s5, s7 137; SI-NEXT: s_mov_b32 s6, s10 138; SI-NEXT: s_mov_b32 s7, s11 139; SI-NEXT: s_mov_b32 s2, s10 140; SI-NEXT: s_mov_b32 s3, s11 141; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 142; SI-NEXT: s_waitcnt vmcnt(0) 143; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 144; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 145; SI-NEXT: s_waitcnt vmcnt(0) 146; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 147; SI-NEXT: v_rcp_f32_e32 v2, v1 148; SI-NEXT: v_mul_f32_e32 v2, v0, v2 149; SI-NEXT: v_trunc_f32_e32 
v2, v2 150; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 151; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 152; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 153; SI-NEXT: s_endpgm 154; 155; CI-LABEL: fast_frem_f16: 156; CI: ; %bb.0: 157; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 158; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 159; CI-NEXT: s_mov_b32 s11, 0xf000 160; CI-NEXT: s_mov_b32 s10, -1 161; CI-NEXT: s_mov_b32 s2, s10 162; CI-NEXT: s_mov_b32 s3, s11 163; CI-NEXT: s_waitcnt lgkmcnt(0) 164; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 165; CI-NEXT: s_mov_b32 s8, s4 166; CI-NEXT: s_mov_b32 s9, s5 167; CI-NEXT: s_mov_b32 s4, s6 168; CI-NEXT: s_mov_b32 s5, s7 169; CI-NEXT: s_mov_b32 s6, s10 170; CI-NEXT: s_mov_b32 s7, s11 171; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 172; CI-NEXT: s_waitcnt vmcnt(1) 173; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 174; CI-NEXT: v_rcp_f32_e32 v2, v1 175; CI-NEXT: s_waitcnt vmcnt(0) 176; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 177; CI-NEXT: v_mul_f32_e32 v2, v0, v2 178; CI-NEXT: v_trunc_f32_e32 v2, v2 179; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 180; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 181; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 182; CI-NEXT: s_endpgm 183; 184; VI-LABEL: fast_frem_f16: 185; VI: ; %bb.0: 186; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 187; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 188; VI-NEXT: s_waitcnt lgkmcnt(0) 189; VI-NEXT: v_mov_b32_e32 v2, s6 190; VI-NEXT: s_add_u32 s0, s0, 8 191; VI-NEXT: v_mov_b32_e32 v3, s7 192; VI-NEXT: s_addc_u32 s1, s1, 0 193; VI-NEXT: flat_load_ushort v4, v[2:3] 194; VI-NEXT: v_mov_b32_e32 v3, s1 195; VI-NEXT: v_mov_b32_e32 v2, s0 196; VI-NEXT: flat_load_ushort v2, v[2:3] 197; VI-NEXT: v_mov_b32_e32 v0, s4 198; VI-NEXT: v_mov_b32_e32 v1, s5 199; VI-NEXT: s_waitcnt vmcnt(0) 200; VI-NEXT: v_rcp_f16_e32 v3, v2 201; VI-NEXT: v_mul_f16_e32 v3, v4, v3 202; VI-NEXT: v_trunc_f16_e32 v3, v3 203; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 204; VI-NEXT: flat_store_short v[0:1], v2 205; VI-NEXT: s_endpgm 206 half 
addrspace(1)* %in2) #0 { 207 %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 208 %r0 = load half, half addrspace(1)* %in1, align 4 209 %r1 = load half, half addrspace(1)* %gep2, align 4 210 %r2 = frem fast half %r0, %r1 211 store half %r2, half addrspace(1)* %out, align 4 212 ret void 213} 214 215define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, 216; SI-LABEL: unsafe_frem_f16: 217; SI: ; %bb.0: 218; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 219; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 220; SI-NEXT: s_mov_b32 s11, 0xf000 221; SI-NEXT: s_mov_b32 s10, -1 222; SI-NEXT: s_waitcnt lgkmcnt(0) 223; SI-NEXT: s_mov_b32 s8, s4 224; SI-NEXT: s_mov_b32 s9, s5 225; SI-NEXT: s_mov_b32 s4, s6 226; SI-NEXT: s_mov_b32 s5, s7 227; SI-NEXT: s_mov_b32 s6, s10 228; SI-NEXT: s_mov_b32 s7, s11 229; SI-NEXT: s_mov_b32 s2, s10 230; SI-NEXT: s_mov_b32 s3, s11 231; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 232; SI-NEXT: s_waitcnt vmcnt(0) 233; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 234; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 235; SI-NEXT: s_waitcnt vmcnt(0) 236; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 237; SI-NEXT: v_rcp_f32_e32 v2, v1 238; SI-NEXT: v_mul_f32_e32 v2, v0, v2 239; SI-NEXT: v_trunc_f32_e32 v2, v2 240; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 241; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 242; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 243; SI-NEXT: s_endpgm 244; 245; CI-LABEL: unsafe_frem_f16: 246; CI: ; %bb.0: 247; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 248; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 249; CI-NEXT: s_mov_b32 s11, 0xf000 250; CI-NEXT: s_mov_b32 s10, -1 251; CI-NEXT: s_mov_b32 s2, s10 252; CI-NEXT: s_mov_b32 s3, s11 253; CI-NEXT: s_waitcnt lgkmcnt(0) 254; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 255; CI-NEXT: s_mov_b32 s8, s4 256; CI-NEXT: s_mov_b32 s9, s5 257; CI-NEXT: s_mov_b32 s4, s6 258; CI-NEXT: s_mov_b32 s5, s7 259; CI-NEXT: s_mov_b32 s6, s10 260; CI-NEXT: s_mov_b32 s7, s11 
261; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 262; CI-NEXT: s_waitcnt vmcnt(1) 263; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 264; CI-NEXT: v_rcp_f32_e32 v2, v1 265; CI-NEXT: s_waitcnt vmcnt(0) 266; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 267; CI-NEXT: v_mul_f32_e32 v2, v0, v2 268; CI-NEXT: v_trunc_f32_e32 v2, v2 269; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 270; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 271; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 272; CI-NEXT: s_endpgm 273; 274; VI-LABEL: unsafe_frem_f16: 275; VI: ; %bb.0: 276; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 277; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 278; VI-NEXT: s_waitcnt lgkmcnt(0) 279; VI-NEXT: v_mov_b32_e32 v2, s6 280; VI-NEXT: s_add_u32 s0, s0, 8 281; VI-NEXT: v_mov_b32_e32 v3, s7 282; VI-NEXT: s_addc_u32 s1, s1, 0 283; VI-NEXT: flat_load_ushort v4, v[2:3] 284; VI-NEXT: v_mov_b32_e32 v3, s1 285; VI-NEXT: v_mov_b32_e32 v2, s0 286; VI-NEXT: flat_load_ushort v2, v[2:3] 287; VI-NEXT: v_mov_b32_e32 v0, s4 288; VI-NEXT: v_mov_b32_e32 v1, s5 289; VI-NEXT: s_waitcnt vmcnt(0) 290; VI-NEXT: v_rcp_f16_e32 v3, v2 291; VI-NEXT: v_mul_f16_e32 v3, v4, v3 292; VI-NEXT: v_trunc_f16_e32 v3, v3 293; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 294; VI-NEXT: flat_store_short v[0:1], v2 295; VI-NEXT: s_endpgm 296 half addrspace(1)* %in2) #1 { 297 %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 298 %r0 = load half, half addrspace(1)* %in1, align 4 299 %r1 = load half, half addrspace(1)* %gep2, align 4 300 %r2 = frem afn half %r0, %r1 301 store half %r2, half addrspace(1)* %out, align 4 302 ret void 303} 304 305define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, 306; SI-LABEL: frem_f32: 307; SI: ; %bb.0: 308; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 309; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 310; SI-NEXT: s_mov_b32 s11, 0xf000 311; SI-NEXT: s_mov_b32 s10, -1 312; SI-NEXT: s_waitcnt lgkmcnt(0) 313; SI-NEXT: s_mov_b32 s8, s4 314; SI-NEXT: s_mov_b32 s9, s5 315; SI-NEXT: s_mov_b32 
s4, s6 316; SI-NEXT: s_mov_b32 s5, s7 317; SI-NEXT: s_mov_b32 s6, s10 318; SI-NEXT: s_mov_b32 s7, s11 319; SI-NEXT: s_mov_b32 s2, s10 320; SI-NEXT: s_mov_b32 s3, s11 321; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 322; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 323; SI-NEXT: s_waitcnt vmcnt(0) 324; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 325; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 326; SI-NEXT: v_rcp_f32_e32 v4, v3 327; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 328; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 329; SI-NEXT: v_fma_f32 v4, v5, v4, v4 330; SI-NEXT: v_mul_f32_e32 v5, v2, v4 331; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 332; SI-NEXT: v_fma_f32 v5, v6, v4, v5 333; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 334; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 335; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 336; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 337; SI-NEXT: v_trunc_f32_e32 v2, v2 338; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 339; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 340; SI-NEXT: s_endpgm 341; 342; CI-LABEL: frem_f32: 343; CI: ; %bb.0: 344; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 345; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 346; CI-NEXT: s_mov_b32 s11, 0xf000 347; CI-NEXT: s_mov_b32 s10, -1 348; CI-NEXT: s_mov_b32 s2, s10 349; CI-NEXT: s_waitcnt lgkmcnt(0) 350; CI-NEXT: s_mov_b32 s8, s4 351; CI-NEXT: s_mov_b32 s9, s5 352; CI-NEXT: s_mov_b32 s4, s6 353; CI-NEXT: s_mov_b32 s5, s7 354; CI-NEXT: s_mov_b32 s6, s10 355; CI-NEXT: s_mov_b32 s7, s11 356; CI-NEXT: s_mov_b32 s3, s11 357; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 358; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 359; CI-NEXT: s_waitcnt vmcnt(0) 360; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 361; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 362; CI-NEXT: v_rcp_f32_e32 v4, v3 363; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 364; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 365; CI-NEXT: v_fma_f32 v4, v5, v4, v4 366; CI-NEXT: 
v_mul_f32_e32 v5, v2, v4 367; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 368; CI-NEXT: v_fma_f32 v5, v6, v4, v5 369; CI-NEXT: v_fma_f32 v2, -v3, v5, v2 370; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 371; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 372; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 373; CI-NEXT: v_trunc_f32_e32 v2, v2 374; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 375; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 376; CI-NEXT: s_endpgm 377; 378; VI-LABEL: frem_f32: 379; VI: ; %bb.0: 380; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 381; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 382; VI-NEXT: s_waitcnt lgkmcnt(0) 383; VI-NEXT: v_mov_b32_e32 v2, s6 384; VI-NEXT: s_add_u32 s0, s0, 16 385; VI-NEXT: v_mov_b32_e32 v3, s7 386; VI-NEXT: s_addc_u32 s1, s1, 0 387; VI-NEXT: flat_load_dword v4, v[2:3] 388; VI-NEXT: v_mov_b32_e32 v3, s1 389; VI-NEXT: v_mov_b32_e32 v2, s0 390; VI-NEXT: flat_load_dword v2, v[2:3] 391; VI-NEXT: v_mov_b32_e32 v0, s4 392; VI-NEXT: v_mov_b32_e32 v1, s5 393; VI-NEXT: s_waitcnt vmcnt(0) 394; VI-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v4 395; VI-NEXT: v_div_scale_f32 v3, vcc, v4, v2, v4 396; VI-NEXT: v_rcp_f32_e32 v6, v5 397; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 398; VI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 399; VI-NEXT: v_fma_f32 v6, v7, v6, v6 400; VI-NEXT: v_mul_f32_e32 v7, v3, v6 401; VI-NEXT: v_fma_f32 v8, -v5, v7, v3 402; VI-NEXT: v_fma_f32 v7, v8, v6, v7 403; VI-NEXT: v_fma_f32 v3, -v5, v7, v3 404; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 405; VI-NEXT: v_div_fmas_f32 v3, v3, v6, v7 406; VI-NEXT: v_div_fixup_f32 v3, v3, v2, v4 407; VI-NEXT: v_trunc_f32_e32 v3, v3 408; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 409; VI-NEXT: flat_store_dword v[0:1], v2 410; VI-NEXT: s_endpgm 411 float addrspace(1)* %in2) #0 { 412 %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 413 %r0 = load float, float addrspace(1)* %in1, align 4 414 %r1 = load float, float addrspace(1)* %gep2, align 4 415 %r2 = frem float %r0, %r1 
416 store float %r2, float addrspace(1)* %out, align 4 417 ret void 418} 419 420define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, 421; SI-LABEL: fast_frem_f32: 422; SI: ; %bb.0: 423; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 424; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 425; SI-NEXT: s_mov_b32 s11, 0xf000 426; SI-NEXT: s_mov_b32 s10, -1 427; SI-NEXT: s_waitcnt lgkmcnt(0) 428; SI-NEXT: s_mov_b32 s8, s4 429; SI-NEXT: s_mov_b32 s9, s5 430; SI-NEXT: s_mov_b32 s4, s6 431; SI-NEXT: s_mov_b32 s5, s7 432; SI-NEXT: s_mov_b32 s6, s10 433; SI-NEXT: s_mov_b32 s7, s11 434; SI-NEXT: s_mov_b32 s2, s10 435; SI-NEXT: s_mov_b32 s3, s11 436; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 437; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 438; SI-NEXT: s_waitcnt vmcnt(0) 439; SI-NEXT: v_rcp_f32_e32 v2, v1 440; SI-NEXT: v_mul_f32_e32 v2, v0, v2 441; SI-NEXT: v_trunc_f32_e32 v2, v2 442; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 443; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 444; SI-NEXT: s_endpgm 445; 446; CI-LABEL: fast_frem_f32: 447; CI: ; %bb.0: 448; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 449; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 450; CI-NEXT: s_mov_b32 s11, 0xf000 451; CI-NEXT: s_mov_b32 s10, -1 452; CI-NEXT: s_mov_b32 s2, s10 453; CI-NEXT: s_waitcnt lgkmcnt(0) 454; CI-NEXT: s_mov_b32 s8, s4 455; CI-NEXT: s_mov_b32 s9, s5 456; CI-NEXT: s_mov_b32 s4, s6 457; CI-NEXT: s_mov_b32 s5, s7 458; CI-NEXT: s_mov_b32 s6, s10 459; CI-NEXT: s_mov_b32 s7, s11 460; CI-NEXT: s_mov_b32 s3, s11 461; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 462; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 463; CI-NEXT: s_waitcnt vmcnt(0) 464; CI-NEXT: v_rcp_f32_e32 v2, v1 465; CI-NEXT: v_mul_f32_e32 v2, v0, v2 466; CI-NEXT: v_trunc_f32_e32 v2, v2 467; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 468; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 469; CI-NEXT: s_endpgm 470; 471; VI-LABEL: fast_frem_f32: 472; VI: ; %bb.0: 473; VI-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 474; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 475; VI-NEXT: s_waitcnt lgkmcnt(0) 476; VI-NEXT: v_mov_b32_e32 v2, s6 477; VI-NEXT: s_add_u32 s0, s0, 16 478; VI-NEXT: v_mov_b32_e32 v3, s7 479; VI-NEXT: s_addc_u32 s1, s1, 0 480; VI-NEXT: flat_load_dword v4, v[2:3] 481; VI-NEXT: v_mov_b32_e32 v3, s1 482; VI-NEXT: v_mov_b32_e32 v2, s0 483; VI-NEXT: flat_load_dword v2, v[2:3] 484; VI-NEXT: v_mov_b32_e32 v0, s4 485; VI-NEXT: v_mov_b32_e32 v1, s5 486; VI-NEXT: s_waitcnt vmcnt(0) 487; VI-NEXT: v_rcp_f32_e32 v3, v2 488; VI-NEXT: v_mul_f32_e32 v3, v4, v3 489; VI-NEXT: v_trunc_f32_e32 v3, v3 490; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 491; VI-NEXT: flat_store_dword v[0:1], v2 492; VI-NEXT: s_endpgm 493 float addrspace(1)* %in2) #0 { 494 %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 495 %r0 = load float, float addrspace(1)* %in1, align 4 496 %r1 = load float, float addrspace(1)* %gep2, align 4 497 %r2 = frem fast float %r0, %r1 498 store float %r2, float addrspace(1)* %out, align 4 499 ret void 500} 501 502define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, 503; SI-LABEL: unsafe_frem_f32: 504; SI: ; %bb.0: 505; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 506; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 507; SI-NEXT: s_mov_b32 s11, 0xf000 508; SI-NEXT: s_mov_b32 s10, -1 509; SI-NEXT: s_waitcnt lgkmcnt(0) 510; SI-NEXT: s_mov_b32 s8, s4 511; SI-NEXT: s_mov_b32 s9, s5 512; SI-NEXT: s_mov_b32 s4, s6 513; SI-NEXT: s_mov_b32 s5, s7 514; SI-NEXT: s_mov_b32 s6, s10 515; SI-NEXT: s_mov_b32 s7, s11 516; SI-NEXT: s_mov_b32 s2, s10 517; SI-NEXT: s_mov_b32 s3, s11 518; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 519; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 520; SI-NEXT: s_waitcnt vmcnt(0) 521; SI-NEXT: v_rcp_f32_e32 v2, v1 522; SI-NEXT: v_mul_f32_e32 v2, v0, v2 523; SI-NEXT: v_trunc_f32_e32 v2, v2 524; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 525; SI-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 526; SI-NEXT: s_endpgm 527; 528; CI-LABEL: unsafe_frem_f32: 529; CI: ; %bb.0: 530; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 531; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 532; CI-NEXT: s_mov_b32 s11, 0xf000 533; CI-NEXT: s_mov_b32 s10, -1 534; CI-NEXT: s_mov_b32 s2, s10 535; CI-NEXT: s_waitcnt lgkmcnt(0) 536; CI-NEXT: s_mov_b32 s8, s4 537; CI-NEXT: s_mov_b32 s9, s5 538; CI-NEXT: s_mov_b32 s4, s6 539; CI-NEXT: s_mov_b32 s5, s7 540; CI-NEXT: s_mov_b32 s6, s10 541; CI-NEXT: s_mov_b32 s7, s11 542; CI-NEXT: s_mov_b32 s3, s11 543; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 544; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 545; CI-NEXT: s_waitcnt vmcnt(0) 546; CI-NEXT: v_rcp_f32_e32 v2, v1 547; CI-NEXT: v_mul_f32_e32 v2, v0, v2 548; CI-NEXT: v_trunc_f32_e32 v2, v2 549; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 550; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 551; CI-NEXT: s_endpgm 552; 553; VI-LABEL: unsafe_frem_f32: 554; VI: ; %bb.0: 555; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 556; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 557; VI-NEXT: s_waitcnt lgkmcnt(0) 558; VI-NEXT: v_mov_b32_e32 v2, s6 559; VI-NEXT: s_add_u32 s0, s0, 16 560; VI-NEXT: v_mov_b32_e32 v3, s7 561; VI-NEXT: s_addc_u32 s1, s1, 0 562; VI-NEXT: flat_load_dword v4, v[2:3] 563; VI-NEXT: v_mov_b32_e32 v3, s1 564; VI-NEXT: v_mov_b32_e32 v2, s0 565; VI-NEXT: flat_load_dword v2, v[2:3] 566; VI-NEXT: v_mov_b32_e32 v0, s4 567; VI-NEXT: v_mov_b32_e32 v1, s5 568; VI-NEXT: s_waitcnt vmcnt(0) 569; VI-NEXT: v_rcp_f32_e32 v3, v2 570; VI-NEXT: v_mul_f32_e32 v3, v4, v3 571; VI-NEXT: v_trunc_f32_e32 v3, v3 572; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 573; VI-NEXT: flat_store_dword v[0:1], v2 574; VI-NEXT: s_endpgm 575 float addrspace(1)* %in2) #1 { 576 %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 577 %r0 = load float, float addrspace(1)* %in1, align 4 578 %r1 = load float, float addrspace(1)* %gep2, align 4 579 %r2 = frem afn float %r0, %r1 580 store float %r2, float 
addrspace(1)* %out, align 4 581 ret void 582} 583 584define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, 585; SI-LABEL: frem_f64: 586; SI: ; %bb.0: 587; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 588; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 589; SI-NEXT: s_mov_b32 s7, 0xf000 590; SI-NEXT: s_mov_b32 s6, -1 591; SI-NEXT: s_waitcnt lgkmcnt(0) 592; SI-NEXT: s_mov_b32 s4, s8 593; SI-NEXT: s_mov_b32 s5, s9 594; SI-NEXT: s_mov_b32 s8, s10 595; SI-NEXT: s_mov_b32 s9, s11 596; SI-NEXT: s_mov_b32 s10, s6 597; SI-NEXT: s_mov_b32 s11, s7 598; SI-NEXT: s_mov_b32 s2, s6 599; SI-NEXT: s_mov_b32 s3, s7 600; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 601; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 602; SI-NEXT: s_waitcnt vmcnt(0) 603; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] 604; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 605; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 606; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 607; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 608; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 609; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1] 610; SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 611; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9] 612; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 613; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v9 614; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc 615; SI-NEXT: s_nop 1 616; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11] 617; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 618; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 619; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 620; SI-NEXT: s_mov_b32 s1, 0xfffff 621; SI-NEXT: s_mov_b32 s0, s6 622; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 623; SI-NEXT: v_not_b32_e32 v6, v6 624; SI-NEXT: v_and_b32_e32 v6, v4, v6 625; SI-NEXT: v_not_b32_e32 v7, v7 626; SI-NEXT: v_and_b32_e32 v7, v5, v7 627; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5 628; SI-NEXT: v_cmp_gt_i32_e32 
vcc, 0, v8 629; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc 630; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8 631; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] 632; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 633; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] 634; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 635; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 636; SI-NEXT: s_endpgm 637; 638; CI-LABEL: frem_f64: 639; CI: ; %bb.0: 640; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 641; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 642; CI-NEXT: s_mov_b32 s11, 0xf000 643; CI-NEXT: s_mov_b32 s10, -1 644; CI-NEXT: s_mov_b32 s2, s10 645; CI-NEXT: s_waitcnt lgkmcnt(0) 646; CI-NEXT: s_mov_b32 s8, s4 647; CI-NEXT: s_mov_b32 s9, s5 648; CI-NEXT: s_mov_b32 s4, s6 649; CI-NEXT: s_mov_b32 s5, s7 650; CI-NEXT: s_mov_b32 s6, s10 651; CI-NEXT: s_mov_b32 s7, s11 652; CI-NEXT: s_mov_b32 s3, s11 653; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 654; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 655; CI-NEXT: s_waitcnt vmcnt(0) 656; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] 657; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 658; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 659; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 660; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 661; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 662; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] 663; CI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 664; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 665; CI-NEXT: s_nop 1 666; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 667; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 668; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 669; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 670; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 671; CI-NEXT: s_endpgm 672; 673; VI-LABEL: frem_f64: 674; VI: ; %bb.0: 675; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 676; VI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x34 677; VI-NEXT: s_waitcnt lgkmcnt(0) 678; VI-NEXT: v_mov_b32_e32 v2, s6 679; VI-NEXT: v_mov_b32_e32 v3, s7 680; VI-NEXT: v_mov_b32_e32 v4, s0 681; VI-NEXT: v_mov_b32_e32 v5, s1 682; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 683; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 684; VI-NEXT: v_mov_b32_e32 v0, s4 685; VI-NEXT: v_mov_b32_e32 v1, s5 686; VI-NEXT: s_waitcnt vmcnt(0) 687; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3] 688; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] 689; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 690; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 691; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 692; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 693; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3] 694; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] 695; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] 696; VI-NEXT: s_nop 1 697; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] 698; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3] 699; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 700; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 701; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 702; VI-NEXT: s_endpgm 703 double addrspace(1)* %in2) #0 { 704 %r0 = load double, double addrspace(1)* %in1, align 8 705 %r1 = load double, double addrspace(1)* %in2, align 8 706 %r2 = frem double %r0, %r1 707 store double %r2, double addrspace(1)* %out, align 8 708 ret void 709} 710 711define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, 712; SI-LABEL: fast_frem_f64: 713; SI: ; %bb.0: 714; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 715; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 716; SI-NEXT: s_mov_b32 s11, 0xf000 717; SI-NEXT: s_mov_b32 s10, -1 718; SI-NEXT: s_waitcnt lgkmcnt(0) 719; SI-NEXT: s_mov_b32 s8, s4 720; SI-NEXT: s_mov_b32 s9, s5 721; SI-NEXT: s_mov_b32 s4, s6 722; SI-NEXT: s_mov_b32 s5, s7 723; SI-NEXT: s_mov_b32 s6, s10 724; 
SI-NEXT: s_mov_b32 s7, s11 725; SI-NEXT: s_mov_b32 s2, s10 726; SI-NEXT: s_mov_b32 s3, s11 727; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 728; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 729; SI-NEXT: s_waitcnt vmcnt(0) 730; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 731; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 732; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 733; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 734; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 735; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 736; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 737; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 738; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 739; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 740; SI-NEXT: s_mov_b32 s1, 0xfffff 741; SI-NEXT: s_mov_b32 s0, s10 742; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 743; SI-NEXT: v_not_b32_e32 v6, v6 744; SI-NEXT: v_and_b32_e32 v6, v4, v6 745; SI-NEXT: v_not_b32_e32 v7, v7 746; SI-NEXT: v_and_b32_e32 v7, v5, v7 747; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5 748; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8 749; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc 750; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8 751; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] 752; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 753; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] 754; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 755; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 756; SI-NEXT: s_endpgm 757; 758; CI-LABEL: fast_frem_f64: 759; CI: ; %bb.0: 760; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 761; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 762; CI-NEXT: s_mov_b32 s11, 0xf000 763; CI-NEXT: s_mov_b32 s10, -1 764; CI-NEXT: s_mov_b32 s2, s10 765; CI-NEXT: s_waitcnt lgkmcnt(0) 766; CI-NEXT: s_mov_b32 s8, s4 767; CI-NEXT: s_mov_b32 s9, s5 768; CI-NEXT: s_mov_b32 s4, s6 769; CI-NEXT: s_mov_b32 s5, s7 770; CI-NEXT: s_mov_b32 s6, s10 771; CI-NEXT: s_mov_b32 s7, s11 772; CI-NEXT: s_mov_b32 s3, s11 773; CI-NEXT: 
buffer_load_dwordx2 v[0:1], off, s[4:7], 0 774; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 775; CI-NEXT: s_waitcnt vmcnt(0) 776; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 777; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 778; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 779; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 780; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 781; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 782; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 783; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 784; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 785; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 786; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 787; CI-NEXT: s_endpgm 788; 789; VI-LABEL: fast_frem_f64: 790; VI: ; %bb.0: 791; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 792; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 793; VI-NEXT: s_waitcnt lgkmcnt(0) 794; VI-NEXT: v_mov_b32_e32 v2, s6 795; VI-NEXT: v_mov_b32_e32 v3, s7 796; VI-NEXT: v_mov_b32_e32 v4, s0 797; VI-NEXT: v_mov_b32_e32 v5, s1 798; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 799; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 800; VI-NEXT: v_mov_b32_e32 v0, s4 801; VI-NEXT: v_mov_b32_e32 v1, s5 802; VI-NEXT: s_waitcnt vmcnt(0) 803; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 804; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 805; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 806; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 807; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 808; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] 809; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] 810; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] 811; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 812; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 813; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 814; VI-NEXT: s_endpgm 815 double addrspace(1)* %in2) #0 { 816 %r0 = load double, double addrspace(1)* %in1, align 8 817 %r1 = load double, double addrspace(1)* %in2, 
align 8 818 %r2 = frem fast double %r0, %r1 819 store double %r2, double addrspace(1)* %out, align 8 820 ret void 821} 822 823define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, 824; SI-LABEL: unsafe_frem_f64: 825; SI: ; %bb.0: 826; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 827; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 828; SI-NEXT: s_mov_b32 s11, 0xf000 829; SI-NEXT: s_mov_b32 s10, -1 830; SI-NEXT: s_waitcnt lgkmcnt(0) 831; SI-NEXT: s_mov_b32 s8, s4 832; SI-NEXT: s_mov_b32 s9, s5 833; SI-NEXT: s_mov_b32 s4, s6 834; SI-NEXT: s_mov_b32 s5, s7 835; SI-NEXT: s_mov_b32 s6, s10 836; SI-NEXT: s_mov_b32 s7, s11 837; SI-NEXT: s_mov_b32 s2, s10 838; SI-NEXT: s_mov_b32 s3, s11 839; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 840; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 841; SI-NEXT: s_waitcnt vmcnt(0) 842; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 843; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 844; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 845; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 846; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 847; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 848; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 849; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 850; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 851; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 852; SI-NEXT: s_mov_b32 s1, 0xfffff 853; SI-NEXT: s_mov_b32 s0, s10 854; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 855; SI-NEXT: v_not_b32_e32 v6, v6 856; SI-NEXT: v_and_b32_e32 v6, v4, v6 857; SI-NEXT: v_not_b32_e32 v7, v7 858; SI-NEXT: v_and_b32_e32 v7, v5, v7 859; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5 860; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8 861; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc 862; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8 863; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] 864; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 865; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] 866; SI-NEXT: v_fma_f64 
v[0:1], -v[4:5], v[2:3], v[0:1] 867; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 868; SI-NEXT: s_endpgm 869; 870; CI-LABEL: unsafe_frem_f64: 871; CI: ; %bb.0: 872; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 873; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 874; CI-NEXT: s_mov_b32 s11, 0xf000 875; CI-NEXT: s_mov_b32 s10, -1 876; CI-NEXT: s_mov_b32 s2, s10 877; CI-NEXT: s_waitcnt lgkmcnt(0) 878; CI-NEXT: s_mov_b32 s8, s4 879; CI-NEXT: s_mov_b32 s9, s5 880; CI-NEXT: s_mov_b32 s4, s6 881; CI-NEXT: s_mov_b32 s5, s7 882; CI-NEXT: s_mov_b32 s6, s10 883; CI-NEXT: s_mov_b32 s7, s11 884; CI-NEXT: s_mov_b32 s3, s11 885; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 886; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 887; CI-NEXT: s_waitcnt vmcnt(0) 888; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 889; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 890; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 891; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 892; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 893; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 894; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 895; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 896; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 897; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 898; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 899; CI-NEXT: s_endpgm 900; 901; VI-LABEL: unsafe_frem_f64: 902; VI: ; %bb.0: 903; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 904; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 905; VI-NEXT: s_waitcnt lgkmcnt(0) 906; VI-NEXT: v_mov_b32_e32 v2, s6 907; VI-NEXT: v_mov_b32_e32 v3, s7 908; VI-NEXT: v_mov_b32_e32 v4, s0 909; VI-NEXT: v_mov_b32_e32 v5, s1 910; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 911; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 912; VI-NEXT: v_mov_b32_e32 v0, s4 913; VI-NEXT: v_mov_b32_e32 v1, s5 914; VI-NEXT: s_waitcnt vmcnt(0) 915; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 916; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 917; 
VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 918; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 919; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 920; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] 921; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] 922; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] 923; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 924; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 925; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 926; VI-NEXT: s_endpgm 927 double addrspace(1)* %in2) #1 { 928 %r0 = load double, double addrspace(1)* %in1, align 8 929 %r1 = load double, double addrspace(1)* %in2, align 8 930 %r2 = frem afn double %r0, %r1 931 store double %r2, double addrspace(1)* %out, align 8 932 ret void 933} 934 935define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, 936; SI-LABEL: frem_v2f16: 937; SI: ; %bb.0: 938; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 939; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 940; SI-NEXT: s_mov_b32 s3, 0xf000 941; SI-NEXT: s_mov_b32 s2, -1 942; SI-NEXT: s_waitcnt lgkmcnt(0) 943; SI-NEXT: s_mov_b32 s0, s4 944; SI-NEXT: s_mov_b32 s1, s5 945; SI-NEXT: s_mov_b32 s4, s6 946; SI-NEXT: s_mov_b32 s5, s7 947; SI-NEXT: s_mov_b32 s6, s2 948; SI-NEXT: s_mov_b32 s7, s3 949; SI-NEXT: s_mov_b32 s10, s2 950; SI-NEXT: s_mov_b32 s11, s3 951; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 952; SI-NEXT: s_waitcnt vmcnt(0) 953; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 954; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 955; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 956; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 957; SI-NEXT: s_waitcnt vmcnt(0) 958; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 959; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 960; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 961; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 962; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 963; SI-NEXT: v_rcp_f32_e32 v6, v5 964; SI-NEXT: s_mov_b32 s6, 3 965; SI-NEXT: s_mov_b32 s7, 0 966; SI-NEXT: 
s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 967; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 968; SI-NEXT: v_fma_f32 v6, v7, v6, v6 969; SI-NEXT: v_mul_f32_e32 v7, v4, v6 970; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 971; SI-NEXT: v_fma_f32 v7, v8, v6, v7 972; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 973; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 974; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 975; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 976; SI-NEXT: v_trunc_f32_e32 v4, v4 977; SI-NEXT: v_fma_f32 v0, -v4, v2, v0 978; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 979; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 980; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 981; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 982; SI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 983; SI-NEXT: v_rcp_f32_e32 v5, v4 984; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 985; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 986; SI-NEXT: v_fma_f32 v5, v6, v5, v5 987; SI-NEXT: v_mul_f32_e32 v6, v2, v5 988; SI-NEXT: v_fma_f32 v7, -v4, v6, v2 989; SI-NEXT: v_fma_f32 v6, v7, v5, v6 990; SI-NEXT: v_fma_f32 v2, -v4, v6, v2 991; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 992; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 993; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 994; SI-NEXT: v_trunc_f32_e32 v2, v2 995; SI-NEXT: v_fma_f32 v1, -v2, v3, v1 996; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 997; SI-NEXT: v_or_b32_e32 v0, v1, v0 998; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 999; SI-NEXT: s_endpgm 1000; 1001; CI-LABEL: frem_v2f16: 1002; CI: ; %bb.0: 1003; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1004; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1005; CI-NEXT: s_mov_b32 s3, 0xf000 1006; CI-NEXT: s_mov_b32 s2, -1 1007; CI-NEXT: s_mov_b32 s10, s2 1008; CI-NEXT: s_waitcnt lgkmcnt(0) 1009; CI-NEXT: s_mov_b32 s0, s4 1010; CI-NEXT: s_mov_b32 s1, s5 1011; CI-NEXT: s_mov_b32 s4, s6 1012; CI-NEXT: s_mov_b32 s5, s7 1013; CI-NEXT: s_mov_b32 s11, s3 1014; CI-NEXT: s_mov_b32 s6, s2 1015; CI-NEXT: s_mov_b32 s7, s3 1016; CI-NEXT: buffer_load_dword 
v0, off, s[4:7], 0 1017; CI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 1018; CI-NEXT: s_mov_b32 s6, 3 1019; CI-NEXT: s_mov_b32 s7, 0 1020; CI-NEXT: s_waitcnt vmcnt(1) 1021; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 1022; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1023; CI-NEXT: s_waitcnt vmcnt(0) 1024; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 1025; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1026; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1027; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 1028; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 1029; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 1030; CI-NEXT: v_rcp_f32_e32 v6, v5 1031; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1032; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 1033; CI-NEXT: v_fma_f32 v6, v7, v6, v6 1034; CI-NEXT: v_mul_f32_e32 v7, v4, v6 1035; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 1036; CI-NEXT: v_fma_f32 v7, v8, v6, v7 1037; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 1038; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1039; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 1040; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 1041; CI-NEXT: v_trunc_f32_e32 v4, v4 1042; CI-NEXT: v_fma_f32 v0, -v4, v2, v0 1043; CI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 1044; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 1045; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1046; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 1047; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1048; CI-NEXT: v_rcp_f32_e32 v5, v4 1049; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1050; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 1051; CI-NEXT: v_fma_f32 v5, v6, v5, v5 1052; CI-NEXT: v_mul_f32_e32 v6, v2, v5 1053; CI-NEXT: v_fma_f32 v7, -v4, v6, v2 1054; CI-NEXT: v_fma_f32 v6, v7, v5, v6 1055; CI-NEXT: v_fma_f32 v2, -v4, v6, v2 1056; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1057; CI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 1058; CI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 1059; CI-NEXT: v_trunc_f32_e32 v2, v2 1060; CI-NEXT: v_fma_f32 v1, -v2, v3, v1 1061; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 1062; CI-NEXT: 
v_or_b32_e32 v0, v1, v0 1063; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1064; CI-NEXT: s_endpgm 1065; 1066; VI-LABEL: frem_v2f16: 1067; VI: ; %bb.0: 1068; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1069; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1070; VI-NEXT: s_waitcnt lgkmcnt(0) 1071; VI-NEXT: v_mov_b32_e32 v2, s6 1072; VI-NEXT: s_add_u32 s0, s0, 16 1073; VI-NEXT: v_mov_b32_e32 v3, s7 1074; VI-NEXT: s_addc_u32 s1, s1, 0 1075; VI-NEXT: flat_load_dword v4, v[2:3] 1076; VI-NEXT: v_mov_b32_e32 v3, s1 1077; VI-NEXT: v_mov_b32_e32 v2, s0 1078; VI-NEXT: flat_load_dword v2, v[2:3] 1079; VI-NEXT: v_mov_b32_e32 v0, s4 1080; VI-NEXT: v_mov_b32_e32 v1, s5 1081; VI-NEXT: s_waitcnt vmcnt(1) 1082; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 1083; VI-NEXT: v_cvt_f32_f16_e32 v5, v3 1084; VI-NEXT: s_waitcnt vmcnt(0) 1085; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 1086; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 1087; VI-NEXT: v_rcp_f32_e32 v7, v7 1088; VI-NEXT: v_mul_f32_e32 v5, v5, v7 1089; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 1090; VI-NEXT: v_div_fixup_f16 v5, v5, v6, v3 1091; VI-NEXT: v_trunc_f16_e32 v5, v5 1092; VI-NEXT: v_fma_f16 v3, -v5, v6, v3 1093; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 1094; VI-NEXT: v_cvt_f32_f16_e32 v5, v4 1095; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1096; VI-NEXT: v_rcp_f32_e32 v6, v6 1097; VI-NEXT: v_mul_f32_e32 v5, v5, v6 1098; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 1099; VI-NEXT: v_div_fixup_f16 v5, v5, v2, v4 1100; VI-NEXT: v_trunc_f16_e32 v5, v5 1101; VI-NEXT: v_fma_f16 v2, -v5, v2, v4 1102; VI-NEXT: v_or_b32_e32 v2, v2, v3 1103; VI-NEXT: flat_store_dword v[0:1], v2 1104; VI-NEXT: s_endpgm 1105 <2 x half> addrspace(1)* %in2) #0 { 1106 %gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4 1107 %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8 1108 %r1 = load <2 x half>, <2 x half> addrspace(1)* %gep2, align 8 1109 %r2 = frem <2 x half> %r0, %r1 1110 store <2 x half> %r2, <2 x half> addrspace(1)* %out, align 8 1111 ret void 1112} 1113 
1114define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in1, 1115; SI-LABEL: frem_v4f16: 1116; SI: ; %bb.0: 1117; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1118; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1119; SI-NEXT: s_mov_b32 s3, 0xf000 1120; SI-NEXT: s_mov_b32 s2, -1 1121; SI-NEXT: s_waitcnt lgkmcnt(0) 1122; SI-NEXT: s_mov_b32 s0, s4 1123; SI-NEXT: s_mov_b32 s1, s5 1124; SI-NEXT: s_mov_b32 s4, s6 1125; SI-NEXT: s_mov_b32 s5, s7 1126; SI-NEXT: s_mov_b32 s6, s2 1127; SI-NEXT: s_mov_b32 s7, s3 1128; SI-NEXT: s_mov_b32 s10, s2 1129; SI-NEXT: s_mov_b32 s11, s3 1130; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1131; SI-NEXT: s_waitcnt vmcnt(0) 1132; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 1133; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1134; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 1135; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 1136; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 1137; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 1138; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32 1139; SI-NEXT: s_waitcnt vmcnt(0) 1140; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 1141; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1142; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1143; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 1144; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1145; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1146; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 1147; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 1148; SI-NEXT: v_rcp_f32_e32 v10, v9 1149; SI-NEXT: s_mov_b32 s6, 3 1150; SI-NEXT: s_mov_b32 s7, 0 1151; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1152; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 1153; SI-NEXT: v_fma_f32 v10, v11, v10, v10 1154; SI-NEXT: v_mul_f32_e32 v11, v8, v10 1155; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 1156; SI-NEXT: v_fma_f32 v11, v12, v10, v11 1157; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 1158; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1159; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 1160; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 1161; SI-NEXT: 
v_trunc_f32_e32 v8, v8 1162; SI-NEXT: v_fma_f32 v1, -v8, v1, v5 1163; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1164; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1165; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1166; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 1167; SI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 1168; SI-NEXT: v_rcp_f32_e32 v9, v8 1169; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1170; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 1171; SI-NEXT: v_fma_f32 v9, v10, v9, v9 1172; SI-NEXT: v_mul_f32_e32 v10, v5, v9 1173; SI-NEXT: v_fma_f32 v11, -v8, v10, v5 1174; SI-NEXT: v_fma_f32 v10, v11, v9, v10 1175; SI-NEXT: v_fma_f32 v5, -v8, v10, v5 1176; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1177; SI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 1178; SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 1179; SI-NEXT: v_trunc_f32_e32 v5, v5 1180; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 1181; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 1182; SI-NEXT: v_or_b32_e32 v1, v4, v1 1183; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 1184; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 1185; SI-NEXT: v_rcp_f32_e32 v7, v5 1186; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1187; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 1188; SI-NEXT: v_fma_f32 v7, v8, v7, v7 1189; SI-NEXT: v_mul_f32_e32 v8, v4, v7 1190; SI-NEXT: v_fma_f32 v9, -v5, v8, v4 1191; SI-NEXT: v_fma_f32 v8, v9, v7, v8 1192; SI-NEXT: v_fma_f32 v4, -v5, v8, v4 1193; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1194; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 1195; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 1196; SI-NEXT: v_trunc_f32_e32 v4, v4 1197; SI-NEXT: v_fma_f32 v0, -v4, v0, v3 1198; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1199; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1200; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 1201; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 1202; SI-NEXT: v_rcp_f32_e32 v5, v4 1203; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1204; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 1205; SI-NEXT: v_fma_f32 v5, v7, v5, 
v5 1206; SI-NEXT: v_mul_f32_e32 v7, v3, v5 1207; SI-NEXT: v_fma_f32 v8, -v4, v7, v3 1208; SI-NEXT: v_fma_f32 v7, v8, v5, v7 1209; SI-NEXT: v_fma_f32 v3, -v4, v7, v3 1210; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1211; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 1212; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 1213; SI-NEXT: v_trunc_f32_e32 v3, v3 1214; SI-NEXT: v_fma_f32 v2, -v3, v6, v2 1215; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1216; SI-NEXT: v_or_b32_e32 v0, v2, v0 1217; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1218; SI-NEXT: s_endpgm 1219; 1220; CI-LABEL: frem_v4f16: 1221; CI: ; %bb.0: 1222; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1223; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1224; CI-NEXT: s_mov_b32 s3, 0xf000 1225; CI-NEXT: s_mov_b32 s2, -1 1226; CI-NEXT: s_mov_b32 s10, s2 1227; CI-NEXT: s_waitcnt lgkmcnt(0) 1228; CI-NEXT: s_mov_b32 s0, s4 1229; CI-NEXT: s_mov_b32 s1, s5 1230; CI-NEXT: s_mov_b32 s4, s6 1231; CI-NEXT: s_mov_b32 s5, s7 1232; CI-NEXT: s_mov_b32 s6, s2 1233; CI-NEXT: s_mov_b32 s7, s3 1234; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1235; CI-NEXT: s_mov_b32 s11, s3 1236; CI-NEXT: s_mov_b32 s6, 3 1237; CI-NEXT: s_mov_b32 s7, 0 1238; CI-NEXT: s_waitcnt vmcnt(0) 1239; CI-NEXT: v_cvt_f32_f16_e32 v2, v0 1240; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1241; CI-NEXT: v_cvt_f32_f16_e32 v3, v0 1242; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 1243; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 1244; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 1245; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32 1246; CI-NEXT: s_waitcnt vmcnt(0) 1247; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 1248; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1249; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 1250; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 1251; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1252; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1253; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 1254; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 1255; CI-NEXT: v_rcp_f32_e32 v10, v9 1256; CI-NEXT: s_setreg_b32 
hwreg(HW_REG_MODE, 4, 2), s6 1257; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 1258; CI-NEXT: v_fma_f32 v10, v11, v10, v10 1259; CI-NEXT: v_mul_f32_e32 v11, v8, v10 1260; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 1261; CI-NEXT: v_fma_f32 v11, v12, v10, v11 1262; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 1263; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1264; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 1265; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 1266; CI-NEXT: v_trunc_f32_e32 v8, v8 1267; CI-NEXT: v_fma_f32 v1, -v8, v1, v5 1268; CI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 1269; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 1270; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1271; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 1272; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1273; CI-NEXT: v_rcp_f32_e32 v9, v8 1274; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1275; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 1276; CI-NEXT: v_fma_f32 v9, v10, v9, v9 1277; CI-NEXT: v_mul_f32_e32 v10, v5, v9 1278; CI-NEXT: v_fma_f32 v11, -v8, v10, v5 1279; CI-NEXT: v_fma_f32 v10, v11, v9, v10 1280; CI-NEXT: v_fma_f32 v5, -v8, v10, v5 1281; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1282; CI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 1283; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 1284; CI-NEXT: v_trunc_f32_e32 v5, v5 1285; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 1286; CI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 1287; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 1288; CI-NEXT: v_or_b32_e32 v1, v4, v1 1289; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 1290; CI-NEXT: v_rcp_f32_e32 v7, v5 1291; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1292; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 1293; CI-NEXT: v_fma_f32 v7, v8, v7, v7 1294; CI-NEXT: v_mul_f32_e32 v8, v4, v7 1295; CI-NEXT: v_fma_f32 v9, -v5, v8, v4 1296; CI-NEXT: v_fma_f32 v8, v9, v7, v8 1297; CI-NEXT: v_fma_f32 v4, -v5, v8, v4 1298; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1299; CI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 1300; CI-NEXT: v_div_fixup_f32 v4, 
v4, v0, v3 1301; CI-NEXT: v_trunc_f32_e32 v4, v4 1302; CI-NEXT: v_fma_f32 v0, -v4, v0, v3 1303; CI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 1304; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 1305; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 1306; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1307; CI-NEXT: v_rcp_f32_e32 v5, v4 1308; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1309; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 1310; CI-NEXT: v_fma_f32 v5, v7, v5, v5 1311; CI-NEXT: v_mul_f32_e32 v7, v3, v5 1312; CI-NEXT: v_fma_f32 v8, -v4, v7, v3 1313; CI-NEXT: v_fma_f32 v7, v8, v5, v7 1314; CI-NEXT: v_fma_f32 v3, -v4, v7, v3 1315; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1316; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 1317; CI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 1318; CI-NEXT: v_trunc_f32_e32 v3, v3 1319; CI-NEXT: v_fma_f32 v2, -v3, v6, v2 1320; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 1321; CI-NEXT: v_or_b32_e32 v0, v2, v0 1322; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1323; CI-NEXT: s_endpgm 1324; 1325; VI-LABEL: frem_v4f16: 1326; VI: ; %bb.0: 1327; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1328; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1329; VI-NEXT: s_waitcnt lgkmcnt(0) 1330; VI-NEXT: v_mov_b32_e32 v2, s6 1331; VI-NEXT: s_add_u32 s0, s0, 32 1332; VI-NEXT: s_addc_u32 s1, s1, 0 1333; VI-NEXT: v_mov_b32_e32 v5, s1 1334; VI-NEXT: v_mov_b32_e32 v4, s0 1335; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 1336; VI-NEXT: v_mov_b32_e32 v3, s7 1337; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1338; VI-NEXT: v_mov_b32_e32 v0, s4 1339; VI-NEXT: v_mov_b32_e32 v1, s5 1340; VI-NEXT: s_waitcnt vmcnt(1) 1341; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 1342; VI-NEXT: v_cvt_f32_f16_e32 v9, v8 1343; VI-NEXT: s_waitcnt vmcnt(0) 1344; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 1345; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 1346; VI-NEXT: v_rcp_f32_e32 v9, v9 1347; VI-NEXT: v_mul_f32_e32 v7, v7, v9 1348; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 1349; VI-NEXT: v_div_fixup_f16 v7, v7, v8, v6 1350; VI-NEXT: 
v_trunc_f16_e32 v7, v7 1351; VI-NEXT: v_fma_f16 v6, -v7, v8, v6 1352; VI-NEXT: v_cvt_f32_f16_e32 v8, v5 1353; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 1354; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 1355; VI-NEXT: v_rcp_f32_e32 v8, v8 1356; VI-NEXT: v_mul_f32_e32 v7, v7, v8 1357; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 1358; VI-NEXT: v_div_fixup_f16 v7, v7, v5, v3 1359; VI-NEXT: v_trunc_f16_e32 v7, v7 1360; VI-NEXT: v_fma_f16 v3, -v7, v5, v3 1361; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 1362; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 1363; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 1364; VI-NEXT: v_or_b32_e32 v3, v3, v6 1365; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 1366; VI-NEXT: v_rcp_f32_e32 v8, v8 1367; VI-NEXT: v_mul_f32_e32 v6, v6, v8 1368; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 1369; VI-NEXT: v_div_fixup_f16 v6, v6, v7, v5 1370; VI-NEXT: v_trunc_f16_e32 v6, v6 1371; VI-NEXT: v_fma_f16 v5, -v6, v7, v5 1372; VI-NEXT: v_cvt_f32_f16_e32 v7, v4 1373; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 1374; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 1375; VI-NEXT: v_rcp_f32_e32 v7, v7 1376; VI-NEXT: v_mul_f32_e32 v6, v6, v7 1377; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 1378; VI-NEXT: v_div_fixup_f16 v6, v6, v4, v2 1379; VI-NEXT: v_trunc_f16_e32 v6, v6 1380; VI-NEXT: v_fma_f16 v2, -v6, v4, v2 1381; VI-NEXT: v_or_b32_e32 v2, v2, v5 1382; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1383; VI-NEXT: s_endpgm 1384 <4 x half> addrspace(1)* %in2) #0 { 1385 %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4 1386 %r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16 1387 %r1 = load <4 x half>, <4 x half> addrspace(1)* %gep2, align 16 1388 %r2 = frem <4 x half> %r0, %r1 1389 store <4 x half> %r2, <4 x half> addrspace(1)* %out, align 16 1390 ret void 1391} 1392 1393define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, 1394; SI-LABEL: frem_v2f32: 1395; SI: ; %bb.0: 1396; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1397; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1398; 
SI-NEXT: s_mov_b32 s3, 0xf000 1399; SI-NEXT: s_mov_b32 s2, -1 1400; SI-NEXT: s_waitcnt lgkmcnt(0) 1401; SI-NEXT: s_mov_b32 s0, s4 1402; SI-NEXT: s_mov_b32 s1, s5 1403; SI-NEXT: s_mov_b32 s4, s6 1404; SI-NEXT: s_mov_b32 s5, s7 1405; SI-NEXT: s_mov_b32 s6, s2 1406; SI-NEXT: s_mov_b32 s7, s3 1407; SI-NEXT: s_mov_b32 s10, s2 1408; SI-NEXT: s_mov_b32 s11, s3 1409; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1410; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 1411; SI-NEXT: s_waitcnt vmcnt(0) 1412; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 1413; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 1414; SI-NEXT: v_rcp_f32_e32 v6, v5 1415; SI-NEXT: s_mov_b32 s6, 3 1416; SI-NEXT: s_mov_b32 s7, 0 1417; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1418; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 1419; SI-NEXT: v_fma_f32 v6, v7, v6, v6 1420; SI-NEXT: v_mul_f32_e32 v7, v4, v6 1421; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 1422; SI-NEXT: v_fma_f32 v7, v8, v6, v7 1423; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 1424; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1425; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 1426; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 1427; SI-NEXT: v_trunc_f32_e32 v4, v4 1428; SI-NEXT: v_fma_f32 v1, -v4, v3, v1 1429; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 1430; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 1431; SI-NEXT: v_rcp_f32_e32 v5, v4 1432; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1433; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 1434; SI-NEXT: v_fma_f32 v5, v6, v5, v5 1435; SI-NEXT: v_mul_f32_e32 v6, v3, v5 1436; SI-NEXT: v_fma_f32 v7, -v4, v6, v3 1437; SI-NEXT: v_fma_f32 v6, v7, v5, v6 1438; SI-NEXT: v_fma_f32 v3, -v4, v6, v3 1439; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1440; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 1441; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 1442; SI-NEXT: v_trunc_f32_e32 v3, v3 1443; SI-NEXT: v_fma_f32 v0, -v3, v2, v0 1444; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1445; SI-NEXT: s_endpgm 
1446; 1447; CI-LABEL: frem_v2f32: 1448; CI: ; %bb.0: 1449; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1450; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1451; CI-NEXT: s_mov_b32 s3, 0xf000 1452; CI-NEXT: s_mov_b32 s2, -1 1453; CI-NEXT: s_mov_b32 s10, s2 1454; CI-NEXT: s_waitcnt lgkmcnt(0) 1455; CI-NEXT: s_mov_b32 s0, s4 1456; CI-NEXT: s_mov_b32 s1, s5 1457; CI-NEXT: s_mov_b32 s4, s6 1458; CI-NEXT: s_mov_b32 s5, s7 1459; CI-NEXT: s_mov_b32 s6, s2 1460; CI-NEXT: s_mov_b32 s7, s3 1461; CI-NEXT: s_mov_b32 s11, s3 1462; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1463; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 1464; CI-NEXT: s_mov_b32 s6, 3 1465; CI-NEXT: s_mov_b32 s7, 0 1466; CI-NEXT: s_waitcnt vmcnt(0) 1467; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 1468; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 1469; CI-NEXT: v_rcp_f32_e32 v6, v5 1470; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1471; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 1472; CI-NEXT: v_fma_f32 v6, v7, v6, v6 1473; CI-NEXT: v_mul_f32_e32 v7, v4, v6 1474; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 1475; CI-NEXT: v_fma_f32 v7, v8, v6, v7 1476; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 1477; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1478; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 1479; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 1480; CI-NEXT: v_trunc_f32_e32 v4, v4 1481; CI-NEXT: v_fma_f32 v1, -v4, v3, v1 1482; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 1483; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 1484; CI-NEXT: v_rcp_f32_e32 v5, v4 1485; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1486; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 1487; CI-NEXT: v_fma_f32 v5, v6, v5, v5 1488; CI-NEXT: v_mul_f32_e32 v6, v3, v5 1489; CI-NEXT: v_fma_f32 v7, -v4, v6, v3 1490; CI-NEXT: v_fma_f32 v6, v7, v5, v6 1491; CI-NEXT: v_fma_f32 v3, -v4, v6, v3 1492; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1493; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 1494; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 
1495; CI-NEXT: v_trunc_f32_e32 v3, v3 1496; CI-NEXT: v_fma_f32 v0, -v3, v2, v0 1497; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1498; CI-NEXT: s_endpgm 1499; 1500; VI-LABEL: frem_v2f32: 1501; VI: ; %bb.0: 1502; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1503; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1504; VI-NEXT: s_mov_b32 s2, 3 1505; VI-NEXT: s_mov_b32 s3, 0 1506; VI-NEXT: s_waitcnt lgkmcnt(0) 1507; VI-NEXT: v_mov_b32_e32 v2, s6 1508; VI-NEXT: s_add_u32 s0, s0, 32 1509; VI-NEXT: s_addc_u32 s1, s1, 0 1510; VI-NEXT: v_mov_b32_e32 v5, s1 1511; VI-NEXT: v_mov_b32_e32 v3, s7 1512; VI-NEXT: v_mov_b32_e32 v4, s0 1513; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1514; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 1515; VI-NEXT: v_mov_b32_e32 v0, s4 1516; VI-NEXT: v_mov_b32_e32 v1, s5 1517; VI-NEXT: s_waitcnt vmcnt(0) 1518; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3 1519; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3 1520; VI-NEXT: v_rcp_f32_e32 v8, v7 1521; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 1522; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 1523; VI-NEXT: v_fma_f32 v8, v9, v8, v8 1524; VI-NEXT: v_mul_f32_e32 v9, v6, v8 1525; VI-NEXT: v_fma_f32 v10, -v7, v9, v6 1526; VI-NEXT: v_fma_f32 v9, v10, v8, v9 1527; VI-NEXT: v_fma_f32 v6, -v7, v9, v6 1528; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 1529; VI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 1530; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3 1531; VI-NEXT: v_trunc_f32_e32 v6, v6 1532; VI-NEXT: v_fma_f32 v3, -v6, v5, v3 1533; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2 1534; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2 1535; VI-NEXT: v_rcp_f32_e32 v7, v6 1536; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 1537; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 1538; VI-NEXT: v_fma_f32 v7, v8, v7, v7 1539; VI-NEXT: v_mul_f32_e32 v8, v5, v7 1540; VI-NEXT: v_fma_f32 v9, -v6, v8, v5 1541; VI-NEXT: v_fma_f32 v8, v9, v7, v8 1542; VI-NEXT: v_fma_f32 v5, -v6, v8, v5 1543; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), 
s3 1544; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 1545; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2 1546; VI-NEXT: v_trunc_f32_e32 v5, v5 1547; VI-NEXT: v_fma_f32 v2, -v5, v4, v2 1548; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1549; VI-NEXT: s_endpgm 1550 <2 x float> addrspace(1)* %in2) #0 { 1551 %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 1552 %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8 1553 %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8 1554 %r2 = frem <2 x float> %r0, %r1 1555 store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8 1556 ret void 1557} 1558 1559define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, 1560; SI-LABEL: frem_v4f32: 1561; SI: ; %bb.0: 1562; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1563; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1564; SI-NEXT: s_mov_b32 s3, 0xf000 1565; SI-NEXT: s_mov_b32 s2, -1 1566; SI-NEXT: s_waitcnt lgkmcnt(0) 1567; SI-NEXT: s_mov_b32 s0, s4 1568; SI-NEXT: s_mov_b32 s1, s5 1569; SI-NEXT: s_mov_b32 s4, s6 1570; SI-NEXT: s_mov_b32 s5, s7 1571; SI-NEXT: s_mov_b32 s6, s2 1572; SI-NEXT: s_mov_b32 s7, s3 1573; SI-NEXT: s_mov_b32 s10, s2 1574; SI-NEXT: s_mov_b32 s11, s3 1575; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 1576; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 1577; SI-NEXT: s_waitcnt vmcnt(0) 1578; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 1579; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 1580; SI-NEXT: v_rcp_f32_e32 v10, v9 1581; SI-NEXT: s_mov_b32 s6, 3 1582; SI-NEXT: s_mov_b32 s7, 0 1583; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1584; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 1585; SI-NEXT: v_fma_f32 v10, v11, v10, v10 1586; SI-NEXT: v_mul_f32_e32 v11, v8, v10 1587; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 1588; SI-NEXT: v_fma_f32 v11, v12, v10, v11 1589; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 1590; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1591; 
SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 1592; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 1593; SI-NEXT: v_trunc_f32_e32 v8, v8 1594; SI-NEXT: v_fma_f32 v3, -v8, v7, v3 1595; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 1596; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 1597; SI-NEXT: v_rcp_f32_e32 v9, v8 1598; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1599; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 1600; SI-NEXT: v_fma_f32 v9, v10, v9, v9 1601; SI-NEXT: v_mul_f32_e32 v10, v7, v9 1602; SI-NEXT: v_fma_f32 v11, -v8, v10, v7 1603; SI-NEXT: v_fma_f32 v10, v11, v9, v10 1604; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 1605; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1606; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 1607; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 1608; SI-NEXT: v_trunc_f32_e32 v7, v7 1609; SI-NEXT: v_fma_f32 v2, -v7, v6, v2 1610; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 1611; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 1612; SI-NEXT: v_rcp_f32_e32 v8, v7 1613; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1614; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 1615; SI-NEXT: v_fma_f32 v8, v9, v8, v8 1616; SI-NEXT: v_mul_f32_e32 v9, v6, v8 1617; SI-NEXT: v_fma_f32 v10, -v7, v9, v6 1618; SI-NEXT: v_fma_f32 v9, v10, v8, v9 1619; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 1620; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1621; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 1622; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 1623; SI-NEXT: v_trunc_f32_e32 v6, v6 1624; SI-NEXT: v_fma_f32 v1, -v6, v5, v1 1625; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 1626; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 1627; SI-NEXT: v_rcp_f32_e32 v7, v6 1628; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1629; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 1630; SI-NEXT: v_fma_f32 v7, v8, v7, v7 1631; SI-NEXT: v_mul_f32_e32 v8, v5, v7 1632; SI-NEXT: v_fma_f32 v9, -v6, v8, v5 1633; SI-NEXT: v_fma_f32 v8, v9, v7, v8 1634; SI-NEXT: v_fma_f32 v5, -v6, v8, v5 1635; SI-NEXT: s_setreg_b32 
hwreg(HW_REG_MODE, 4, 2), s7 1636; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 1637; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 1638; SI-NEXT: v_trunc_f32_e32 v5, v5 1639; SI-NEXT: v_fma_f32 v0, -v5, v4, v0 1640; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1641; SI-NEXT: s_endpgm 1642; 1643; CI-LABEL: frem_v4f32: 1644; CI: ; %bb.0: 1645; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1646; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1647; CI-NEXT: s_mov_b32 s3, 0xf000 1648; CI-NEXT: s_mov_b32 s2, -1 1649; CI-NEXT: s_mov_b32 s10, s2 1650; CI-NEXT: s_waitcnt lgkmcnt(0) 1651; CI-NEXT: s_mov_b32 s0, s4 1652; CI-NEXT: s_mov_b32 s1, s5 1653; CI-NEXT: s_mov_b32 s4, s6 1654; CI-NEXT: s_mov_b32 s5, s7 1655; CI-NEXT: s_mov_b32 s6, s2 1656; CI-NEXT: s_mov_b32 s7, s3 1657; CI-NEXT: s_mov_b32 s11, s3 1658; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 1659; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 1660; CI-NEXT: s_mov_b32 s6, 3 1661; CI-NEXT: s_mov_b32 s7, 0 1662; CI-NEXT: s_waitcnt vmcnt(0) 1663; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 1664; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 1665; CI-NEXT: v_rcp_f32_e32 v10, v9 1666; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1667; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 1668; CI-NEXT: v_fma_f32 v10, v11, v10, v10 1669; CI-NEXT: v_mul_f32_e32 v11, v8, v10 1670; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 1671; CI-NEXT: v_fma_f32 v11, v12, v10, v11 1672; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 1673; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1674; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 1675; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 1676; CI-NEXT: v_trunc_f32_e32 v8, v8 1677; CI-NEXT: v_fma_f32 v3, -v8, v7, v3 1678; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 1679; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 1680; CI-NEXT: v_rcp_f32_e32 v9, v8 1681; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1682; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 1683; CI-NEXT: v_fma_f32 v9, v10, v9, v9 1684; 
CI-NEXT: v_mul_f32_e32 v10, v7, v9 1685; CI-NEXT: v_fma_f32 v11, -v8, v10, v7 1686; CI-NEXT: v_fma_f32 v10, v11, v9, v10 1687; CI-NEXT: v_fma_f32 v7, -v8, v10, v7 1688; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1689; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 1690; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 1691; CI-NEXT: v_trunc_f32_e32 v7, v7 1692; CI-NEXT: v_fma_f32 v2, -v7, v6, v2 1693; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 1694; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 1695; CI-NEXT: v_rcp_f32_e32 v8, v7 1696; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1697; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 1698; CI-NEXT: v_fma_f32 v8, v9, v8, v8 1699; CI-NEXT: v_mul_f32_e32 v9, v6, v8 1700; CI-NEXT: v_fma_f32 v10, -v7, v9, v6 1701; CI-NEXT: v_fma_f32 v9, v10, v8, v9 1702; CI-NEXT: v_fma_f32 v6, -v7, v9, v6 1703; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1704; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 1705; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 1706; CI-NEXT: v_trunc_f32_e32 v6, v6 1707; CI-NEXT: v_fma_f32 v1, -v6, v5, v1 1708; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 1709; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 1710; CI-NEXT: v_rcp_f32_e32 v7, v6 1711; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 1712; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 1713; CI-NEXT: v_fma_f32 v7, v8, v7, v7 1714; CI-NEXT: v_mul_f32_e32 v8, v5, v7 1715; CI-NEXT: v_fma_f32 v9, -v6, v8, v5 1716; CI-NEXT: v_fma_f32 v8, v9, v7, v8 1717; CI-NEXT: v_fma_f32 v5, -v6, v8, v5 1718; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 1719; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 1720; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 1721; CI-NEXT: v_trunc_f32_e32 v5, v5 1722; CI-NEXT: v_fma_f32 v0, -v5, v4, v0 1723; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1724; CI-NEXT: s_endpgm 1725; 1726; VI-LABEL: frem_v4f32: 1727; VI: ; %bb.0: 1728; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1729; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1730; VI-NEXT: s_mov_b32 s2, 3 
1731; VI-NEXT: s_mov_b32 s3, 0 1732; VI-NEXT: s_waitcnt lgkmcnt(0) 1733; VI-NEXT: v_mov_b32_e32 v0, s6 1734; VI-NEXT: s_add_u32 s0, s0, 64 1735; VI-NEXT: s_addc_u32 s1, s1, 0 1736; VI-NEXT: v_mov_b32_e32 v5, s1 1737; VI-NEXT: v_mov_b32_e32 v1, s7 1738; VI-NEXT: v_mov_b32_e32 v4, s0 1739; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1740; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 1741; VI-NEXT: v_mov_b32_e32 v8, s4 1742; VI-NEXT: v_mov_b32_e32 v9, s5 1743; VI-NEXT: s_waitcnt vmcnt(0) 1744; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3 1745; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 1746; VI-NEXT: v_rcp_f32_e32 v12, v11 1747; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 1748; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 1749; VI-NEXT: v_fma_f32 v12, v13, v12, v12 1750; VI-NEXT: v_mul_f32_e32 v13, v10, v12 1751; VI-NEXT: v_fma_f32 v14, -v11, v13, v10 1752; VI-NEXT: v_fma_f32 v13, v14, v12, v13 1753; VI-NEXT: v_fma_f32 v10, -v11, v13, v10 1754; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 1755; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 1756; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3 1757; VI-NEXT: v_trunc_f32_e32 v10, v10 1758; VI-NEXT: v_fma_f32 v3, -v10, v7, v3 1759; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2 1760; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 1761; VI-NEXT: v_rcp_f32_e32 v11, v10 1762; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 1763; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0 1764; VI-NEXT: v_fma_f32 v11, v12, v11, v11 1765; VI-NEXT: v_mul_f32_e32 v12, v7, v11 1766; VI-NEXT: v_fma_f32 v13, -v10, v12, v7 1767; VI-NEXT: v_fma_f32 v12, v13, v11, v12 1768; VI-NEXT: v_fma_f32 v7, -v10, v12, v7 1769; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 1770; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12 1771; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 1772; VI-NEXT: v_trunc_f32_e32 v7, v7 1773; VI-NEXT: v_fma_f32 v2, -v7, v6, v2 1774; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 1775; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 1776; 
; VI-NEXT:    v_rcp_f32_e32 v10, v7
; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
; VI-NEXT:    v_fma_f32 v11, -v7, v10, 1.0
; VI-NEXT:    v_fma_f32 v10, v11, v10, v10
; VI-NEXT:    v_mul_f32_e32 v11, v6, v10
; VI-NEXT:    v_fma_f32 v12, -v7, v11, v6
; VI-NEXT:    v_fma_f32 v11, v12, v10, v11
; VI-NEXT:    v_fma_f32 v6, -v7, v11, v6
; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
; VI-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
; VI-NEXT:    v_trunc_f32_e32 v6, v6
; VI-NEXT:    v_fma_f32 v1, -v6, v5, v1
; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v0
; VI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
; VI-NEXT:    v_rcp_f32_e32 v7, v6
; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
; VI-NEXT:    v_fma_f32 v10, -v6, v7, 1.0
; VI-NEXT:    v_fma_f32 v7, v10, v7, v7
; VI-NEXT:    v_mul_f32_e32 v10, v5, v7
; VI-NEXT:    v_fma_f32 v11, -v6, v10, v5
; VI-NEXT:    v_fma_f32 v10, v11, v7, v10
; VI-NEXT:    v_fma_f32 v5, -v6, v10, v5
; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v10
; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
; VI-NEXT:    v_trunc_f32_e32 v5, v5
; VI-NEXT:    v_fma_f32 v0, -v5, v4, v0
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT:    s_endpgm
                      <4 x float> addrspace(1)* %in2) #0 {
  %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
  %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16
  %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16
  %r2 = frem <4 x float> %r0, %r1
  store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; frem_v2f64 - frem on <2 x double> operands.
;
; The kernel loads one operand from %in1 and the other from element 4 of %in2
; (the getelementptr with index 4; the expectations below show this as a
; 64-byte offset), computes `frem` of the two and stores the result to %out.
; It uses attribute group #0.
;
; In the expectations each of the two lanes appears as a
; v_div_scale_f64 / v_rcp_f64 / v_fma_f64 / v_div_fmas_f64 / v_div_fixup_f64
; sequence, followed by a truncation step and a final v_fma_f64 whose first
; operand is the negated truncated value. The CI and VI expectations use
; v_trunc_f64 for the truncation step. The SI expectations contain no
; v_trunc_f64; in that position they show a v_bfe_u32 / v_lshr_b64 /
; v_not_b32 / v_and_b32 / v_cndmask_b32 sequence (presumably a manual
; truncation expansion), and they also derive vcc with v_cmp_eq_u32 and
; s_xor_b64 ahead of each v_div_fmas_f64.
;
; The assertion lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
; SI-LABEL: frem_v2f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s8
; SI-NEXT:    s_mov_b32 s5, s9
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_mov_b32 s2, s6
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
; SI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; SI-NEXT:    v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3]
; SI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; SI-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13]
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v9
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v3, v13
; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
; SI-NEXT:    s_nop 1
; SI-NEXT:    v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
; SI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; SI-NEXT:    v_bfe_u32 v10, v9, 20, 11
; SI-NEXT:    s_movk_i32 s8, 0xfc01
; SI-NEXT:    v_add_i32_e32 v12, vcc, s8, v10
; SI-NEXT:    s_mov_b32 s3, 0xfffff
; SI-NEXT:    v_lshr_b64 v[10:11], s[2:3], v12
; SI-NEXT:    v_not_b32_e32 v10, v10
; SI-NEXT:    v_and_b32_e32 v10, v8, v10
; SI-NEXT:    v_not_b32_e32 v11, v11
; SI-NEXT:    v_and_b32_e32 v11, v9, v11
; SI-NEXT:    s_brev_b32 s9, 1
; SI-NEXT:    v_and_b32_e32 v13, s9, v9
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v12
; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v12
; SI-NEXT:    v_cndmask_b32_e64 v9, v11, v9, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v8, v10, v8, s[0:1]
; SI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; SI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
; SI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; SI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1]
; SI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
; SI-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11]
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v11
; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
; SI-NEXT:    s_nop 1
; SI-NEXT:    v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
; SI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; SI-NEXT:    v_bfe_u32 v8, v7, 20, 11
; SI-NEXT:    v_add_i32_e32 v10, vcc, s8, v8
; SI-NEXT:    v_lshr_b64 v[8:9], s[2:3], v10
; SI-NEXT:    v_not_b32_e32 v8, v8
; SI-NEXT:    v_and_b32_e32 v8, v6, v8
; SI-NEXT:    v_not_b32_e32 v9, v9
; SI-NEXT:    v_and_b32_e32 v9, v7, v9
; SI-NEXT:    v_and_b32_e32 v11, s9, v7
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v10
; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v10
; SI-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
; SI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: frem_v2f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s10, s2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s0, s4
; CI-NEXT:    s_mov_b32 s1, s5
; CI-NEXT:    s_mov_b32 s4, s6
; CI-NEXT:    s_mov_b32 s5, s7
; CI-NEXT:    s_mov_b32 s6, s2
; CI-NEXT:    s_mov_b32 s7, s3
; CI-NEXT:    s_mov_b32 s11, s3
; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3]
; CI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; CI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
; CI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; CI-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
; CI-NEXT:    s_nop 1
; CI-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
; CI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; CI-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
; CI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; CI-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1]
; CI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; CI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
; CI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
; CI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; CI-NEXT:    s_nop 1
; CI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; CI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; CI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
; CI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_v2f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    s_add_u32 s0, s0, 64
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v8, s4
; VI-NEXT:    v_mov_b32_e32 v9, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3]
; VI-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
; VI-NEXT:    v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3]
; VI-NEXT:    v_mul_f64 v[16:17], v[14:15], v[12:13]
; VI-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15]
; VI-NEXT:    s_nop 1
; VI-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17]
; VI-NEXT:    v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3]
; VI-NEXT:    v_trunc_f64_e32 v[10:11], v[10:11]
; VI-NEXT:    v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3]
; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
; VI-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; VI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1]
; VI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13]
; VI-NEXT:    s_nop 1
; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15]
; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
; VI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT:    s_endpgm
                        <2 x double> addrspace(1)* %in2) #0 {
  %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4
  %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16
  %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16
  %r2 = frem <2 x double> %r0, %r1
  store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16
  ret void
}

; Attribute groups. Both are nounwind and set "denormal-fp-math-f32" to
; "preserve-sign,preserve-sign"; they differ only in "unsafe-fp-math".
; #0 ("unsafe-fp-math"="false") is the group used by the two kernels directly above.
; #1 ("unsafe-fp-math"="true") is not referenced by those two kernels.
attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }