; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefixes=VI %s

; IEEE bit enabled for compute kernel, so shouldn't use omod.
define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
; SI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v2, 0.5, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f32_e32 v2, 1.0, v3
; VI-NEXT:    v_mul_f32_e32 v2, 0.5, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* %out.gep
  ret void
}
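
; The omod output modifier on VOP3 instructions scales a result by 2.0,
; 4.0, or 0.5 for free, but it is not IEEE compliant: it is understood to
; flush denormal results and need not preserve the sign of zero. Folding a
; multiply into omod is therefore only legal when the IEEE mode bit is
; clear and the FP environment says signed zeros and denormals don't matter.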

; IEEE bit enabled for compute kernel, so shouldn't use omod.
define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(double addrspace(1)* %out, double addrspace(1)* %aptr) #4 {
; SI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
; SI-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %add = fadd double %a, 1.0
  %div2 = fmul double %add, 0.5
  store double %div2, double addrspace(1)* %out.gep
  ret void
}

; IEEE bit enabled for compute kernel, so shouldn't use omod even though nsz is allowed.
define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
; SI-LABEL: v_omod_div2_f32_enable_ieee_nsz:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v2, 0.5, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f32_enable_ieee_nsz:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f32_e32 v2, 1.0, v3
; VI-NEXT:    v_mul_f32_e32 v2, 0.5, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* %out.gep
  ret void
}

; IEEE bit enabled for compute kernel, so shouldn't use omod even though nsz is allowed.
define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(double addrspace(1)* %out, double addrspace(1)* %aptr) #5 {
; SI-LABEL: v_omod_div2_f64_enable_ieee_nsz:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
; SI-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f64_enable_ieee_nsz:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %add = fadd nsz double %a, 1.0
  %div2 = fmul nsz double %add, 0.5
  store double %div2, double addrspace(1)* %out.gep
  ret void
}

; Even without the IEEE bit (amdgpu_ps), omod can't be used when signed zeros are significant.
define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {
; SI-LABEL: v_omod_div2_f32_signed_zeros:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f32_signed_zeros:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}
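
; With the IEEE bit clear (amdgpu_ps) the fold would otherwise be legal, so
; this test and the f64 one below isolate the signed-zeros requirement:
; attribute #4 sets "no-signed-zeros-fp-math"="false", so the 0.5 multiply
; must stay.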

; Even without the IEEE bit (amdgpu_ps), omod can't be used when signed zeros are significant.
define amdgpu_ps void @v_omod_div2_f64_signed_zeros(double %a) #4 {
; SI-LABEL: v_omod_div2_f64_signed_zeros:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f64_signed_zeros:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_endpgm
  %add = fadd double %a, 1.0
  %div2 = fmul double %add, 0.5
  store double %div2, double addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {
; SI-LABEL: v_omod_div2_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_div2_f64(double %a) #5 {
; SI-LABEL: v_omod_div2_f64:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 div:2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f64:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 div:2
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_endpgm
  %add = fadd nsz double %a, 1.0
  %div2 = fmul nsz double %add, 0.5
  store double %div2, double addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
; SI-LABEL: v_omod_mul2_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul2_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:2
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 2.0
  store float %div2, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_mul2_f64(double %a) #5 {
; SI-LABEL: v_omod_mul2_f64:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul2_f64:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:2
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_endpgm
  %add = fadd nsz double %a, 1.0
  %div2 = fmul nsz double %add, 2.0
  store double %div2, double addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
; SI-LABEL: v_omod_mul4_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul4_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 4.0
  store float %div2, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_mul4_f64(double %a) #5 {
; SI-LABEL: v_omod_mul4_f64:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:4
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul4_f64:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:4
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_endpgm
  %add = fadd nsz double %a, 1.0
  %div2 = fmul nsz double %add, 4.0
  store double %div2, double addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
; SI-LABEL: v_omod_mul4_multi_use_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_mul_f32_e32 v1, 4.0, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul4_multi_use_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_mul_f32_e32 v1, 4.0, v0
; VI-NEXT:    flat_store_dword v[0:1], v1
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 4.0
  store float %div2, float addrspace(1)* undef
  store volatile float %add, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 {
; SI-LABEL: v_omod_mul4_dbg_use_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul4_dbg_use_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
  %div2 = fmul float %add, 4.0
  store float %div2, float addrspace(1)* undef
  ret void
}
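
; Note the contrast above: a second, volatile use of %add blocks the fold,
; but a dbg.value use does not count as a real use and mul:4 still folds.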

; Clamp is applied after omod, folding both into instruction is OK.
define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 {
; SI-LABEL: v_clamp_omod_div2_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp div:2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_clamp_omod_div2_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp div:2
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5

  %max = call float @llvm.maxnum.f32(float %div2, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  store float %clamp, float addrspace(1)* undef
  ret void
}

; Cannot fold omod into clamp
define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 {
; SI-LABEL: v_omod_div2_clamp_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
; SI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_clamp_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
; VI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  %div2 = fmul float %clamp, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 {
; SI-LABEL: v_omod_div2_abs_src_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_mul_f32_e64 v0, |v0|, 0.5
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_abs_src_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_mul_f32_e64 v0, |v0|, 0.5
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %abs.add = call float @llvm.fabs.f32(float %add)
  %div2 = fmul float %abs.add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 {
; SI-LABEL: v_omod_add_self_clamp_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, v0 clamp
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_add_self_clamp_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, v0 clamp
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, %a
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  store float %clamp, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 {
; SI-LABEL: v_omod_add_clamp_self_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
; SI-NEXT:    v_add_f32_e32 v0, v0, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_add_clamp_self_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
; VI-NEXT:    v_add_f32_e32 v0, v0, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  %add = fadd float %clamp, %clamp
  store float %add, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 {
; SI-LABEL: v_omod_add_abs_self_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_add_f32_e64 v0, |v0|, |v0|
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_add_abs_self_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_add_f32_e64 v0, |v0|, |v0|
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %x = fadd float %a, 1.0
  %abs.x = call float @llvm.fabs.f32(float %x)
  %add = fadd float %abs.x, %abs.x
  store float %add, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 {
; SI-LABEL: v_omod_add_abs_x_x_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_add_f32_e64 v0, |v0|, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_add_abs_x_x_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_add_f32_e64 v0, |v0|, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %x = fadd float %a, 1.0
  %abs.x = call float @llvm.fabs.f32(float %x)
  %add = fadd float %abs.x, %x
  store float %add, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 {
; SI-LABEL: v_omod_add_x_abs_x_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_add_f32_e64 v0, v0, |v0|
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_add_x_abs_x_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_add_f32_e64 v0, v0, |v0|
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %x = fadd float %a, 1.0
  %abs.x = call float @llvm.fabs.f32(float %x)
  %add = fadd float %x, %abs.x
  store float %add, float addrspace(1)* undef
  ret void
}
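
; In the fabs tests above, the halving or doubling applies to |x| (or
; mixes |x| with x), not to the raw result of the defining add, so it
; cannot be expressed as an output modifier on that add.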

; Don't fold omod onto an instruction that already folded an omod: only the first div2 folds, the second multiply stays.
define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 {
; SI-LABEL: v_omod_div2_omod_div2_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
; SI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_omod_div2_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
; VI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2.0 = fmul float %add, 0.5
  %div2.1 = fmul float %div2.0, 0.5
  store float %div2.1, float addrspace(1)* undef
  ret void
}

; Don't fold omod if denorms enabled.
define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {
; SI-LABEL: v_omod_div2_f32_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f32_denormals:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

; Don't fold omod if denorms enabled.
define amdgpu_ps void @v_omod_div2_f64_denormals(double %a) #6 {
; SI-LABEL: v_omod_div2_f64_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f64_denormals:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_endpgm
  %add = fadd double %a, 1.0
  %div2 = fmul double %add, 0.5
  store double %div2, double addrspace(1)* undef
  ret void
}

; Don't fold omod if denorms enabled for add form.
define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {
; SI-LABEL: v_omod_mul2_f32_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_add_f32_e32 v0, v0, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul2_f32_denormals:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_add_f32_e32 v0, v0, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %mul2 = fadd float %add, %add
  store float %mul2, float addrspace(1)* undef
  ret void
}
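
; omod is understood to flush denormal results regardless of the denormal
; mode, so whenever the relevant type's denormal mode is "ieee" the
; separate multiply (or add) must be kept.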

; Don't fold omod if denorms enabled for add form.
define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #2 {
; SI-LABEL: v_omod_mul2_f64_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul2_f64_denormals:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_endpgm
  %add = fadd double %a, 1.0
  %mul2 = fadd double %add, %add
  store double %mul2, double addrspace(1)* undef
  ret void
}

; Don't fold omod if denorms enabled. (On SI, f16 is promoted to f32, and
; attribute #0 flushes f32 denorms, so the fold is still legal there.)
define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
; SI-LABEL: v_omod_div2_f16_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f16_denormals:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
; VI-NEXT:    v_mul_f16_e32 v0, 0.5, v0
; VI-NEXT:    flat_store_short v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd half %a, 1.0
  %div2 = fmul half %add, 0.5
  store half %div2, half addrspace(1)* undef
  ret void
}

; Don't fold omod if denorms enabled for add form.
define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
; SI-LABEL: v_omod_mul2_f16_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul2_f16_denormals:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
; VI-NEXT:    v_add_f16_e32 v0, v0, v0
; VI-NEXT:    flat_store_short v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd half %a, 1.0
  %mul2 = fadd half %add, %add
  store half %mul2, half addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
; SI-LABEL: v_omod_div2_f16_no_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f16_no_denormals:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f16_e64 v0, v0, 1.0 div:2
; VI-NEXT:    flat_store_short v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd half %a, 1.0
  %div2 = fmul half %add, 0.5
  store half %div2, half addrspace(1)* undef
  ret void
}

; Folding omod requires converting v_mac to v_mad, since VOP2 v_mac can't encode output modifiers.
define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 {
; SI-LABEL: v_omod_mac_to_mad:
; SI:       ; %bb.0:
; SI-NEXT:    v_mad_f32 v1, v1, v1, v0 mul:2
; SI-NEXT:    v_mul_f32_e32 v0, v1, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mac_to_mad:
; VI:       ; %bb.0:
; VI-NEXT:    v_mad_f32 v1, v1, v1, v0 mul:2
; VI-NEXT:    v_mul_f32_e32 v0, v1, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %mul = fmul float %a, %a
  %add = fadd float %mul, %b
  %mad = fmul float %add, 2.0
  %res = fmul float %mad, %b
  store float %res, float addrspace(1)* undef
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare float @llvm.floor.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1
declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
declare double @llvm.fabs.f64(double) #1
declare double @llvm.minnum.f64(double, double) #1
declare double @llvm.maxnum.f64(double, double) #1
declare half @llvm.fabs.f16(half) #1
declare half @llvm.minnum.f16(half, half) #1
declare half @llvm.maxnum.f16(half, half) #1
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1

attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "denormal-fp-math-f32"="ieee,ieee" "no-signed-zeros-fp-math"="true" }
attributes #3 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" }
attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" }
attributes #5 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
attributes #6 = { nounwind "denormal-fp-math"="ieee,ieee" "no-signed-zeros-fp-math"="true" }

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2, !3}

!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
!2 = !{i32 2, !"Dwarf Version", i32 4}
!3 = !{i32 2, !"Debug Info Version", i32 3}
!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
!6 = !DISubroutineType(types: !7)
!7 = !{null, !8}
!8 = !DIBasicType(name: "float", size: 32, align: 32)
!9 = !DIExpression()
!10 = !DILocation(line: 1, column: 42, scope: !5)
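
; Attribute key for the modes exercised above: #0 f32 denorms flushed + nsz,
; #2 f32 denorms enabled + nsz, #3 all denorms flushed + nsz, #4 signed
; zeros significant, #5 all denorms flushed (nsz via instruction flags),
; #6 denorms enabled + nsz.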