; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s

; minnum(maxnum(%a, 0.0), 1.0) on a loaded f32. Every prefix expects one
; v_max_f32 of the value with itself carrying the clamp modifier.
define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Same 0.0/1.0 clamp with the input negated by fneg. The checks expect the
; negation as a source modifier (-v) on the clamping v_max_f32.
define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_neg_f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_max_f32_e64 v2, -v2, -v2 clamp
; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_neg_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_max_f32_e64 v2, -v3, -v3 clamp
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_neg_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_neg_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %fneg.a = fneg float %a
  %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Same 0.0/1.0 clamp applied to fneg(fabs(%a)). The checks expect both
; operations as source modifiers (-|v|) on the clamping v_max_f32.
define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_negabs_f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_max_f32_e64 v2, -|v2|, -|v2| clamp
; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_negabs_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_max_f32_e64 v2, -|v3|, -|v3| clamp
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_negabs_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1| clamp
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1| clamp
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %fabs.a = call float @llvm.fabs.f32(float %a)
  %fneg.fabs.a = fneg float %fabs.a

  %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; The lower bound is -0.0 instead of 0.0, and the clamped value comes from an
; `fadd nnan`. No clamp modifier is expected: the checks keep an explicit max
; against 0x80000000 and a min against 1.0 (one v_maxmin_f32 on GFX11).
define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_negzero_f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_add_f32_e32 v2, 0.5, v2
; GFX6-NEXT:    v_max_f32_e32 v2, 0x80000000, v2
; GFX6-NEXT:    v_min_f32_e32 v2, 1.0, v2
; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_negzero_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_f32_e32 v2, 0.5, v3
; GFX8-NEXT:    v_max_f32_e32 v2, 0x80000000, v2
; GFX8-NEXT:    v_min_f32_e32 v2, 1.0, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_negzero_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f32_e32 v1, 0.5, v1
; GFX9-NEXT:    v_max_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT:    v_min_f32_e32 v1, 1.0, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_negzero_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_f32_e32 v1, 0.5, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_maxmin_f32 v1, v1, 0x80000000, 1.0
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd nnan float %a, 0.5
  %max = call float @llvm.maxnum.f32(float %add, float -0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; FIXME: Weird inconsistency in how -0.0 is treated. Accepted if clamp
; matched through med3, not if directly. Is this correct?
; Clamp to [-0.0, 1.0] applied directly to the loaded value. The checks expect
; v_mul_f32 1.0 (GFX6/GFX8) or v_max_f32 v, v (GFX9/GFX11) on the input first,
; presumably to quiet a possible signaling NaN as the test name suggests. An
; explicit max against 0x80000000 and min against 1.0 follow (v_maxmin_f32 on
; GFX11); no clamp modifier is used.
define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT:    v_max_f32_e32 v2, 0x80000000, v2
; GFX6-NEXT:    v_min_f32_e32 v2, 1.0, v2
; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
; GFX8-NEXT:    v_max_f32_e32 v2, 0x80000000, v2
; GFX8-NEXT:    v_min_f32_e32 v2, 1.0, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
; GFX9-NEXT:    v_max_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT:    v_min_f32_e32 v1, 1.0, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_maxmin_f32 v1, v1, 0x80000000, 1.0
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float -0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; The maxnum result has a second use (a volatile store to an undef pointer).
; The checks expect separate v_max_f32 and v_min_f32 instructions and two
; stores, with no clamp modifier.
define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_multi_use_max_f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT:    v_max_f32_e32 v2, 0, v2
; GFX6-NEXT:    v_min_f32_e32 v3, 1.0, v2
; GFX6-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_multi_use_max_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
; GFX8-NEXT:    v_max_f32_e32 v2, 0, v2
; GFX8-NEXT:    v_min_f32_e32 v3, 1.0, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_multi_use_max_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
; GFX9-NEXT:    v_max_f32_e32 v1, 0, v1
; GFX9-NEXT:    v_min_f32_e32 v2, 1.0, v1
; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
; GFX9-NEXT:    global_store_dword v[0:1], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_multi_use_max_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_max_f32_e32 v1, 0, v1
; GFX11-NEXT:    v_min_f32_e32 v2, 1.0, v1
; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
; GFX11-NEXT:    global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  store volatile float %max, float addrspace(1)* undef
  ret void
}

; f16 version of the 0.0/1.0 clamp. GFX8 and newer expect v_max_f16 with the
; clamp modifier. GFX6 expects the clamp modifier on v_cvt_f32_f16, followed by
; v_cvt_f16_f32.
define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_max_f16_e64 v2, v3, v3 clamp
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e64 v1, v1, v1 clamp
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f16_e64 v1, v1, v1 clamp
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %max = call half @llvm.maxnum.f16(half %a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; f16 clamp of a negated input. The checks expect the negation as a source
; modifier (-v) on the clamping instruction.
; NOTE(review): the negation is written `fsub half -0.0, %a` here, while the
; f32 variant of this test uses `fneg`. Confirm the difference is intentional
; before unifying, since the checks are autogenerated.
define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_neg_f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, -v2 clamp
; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_neg_f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_max_f16_e64 v2, -v3, -v3 clamp
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_neg_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e64 v1, -v1, -v1 clamp
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_neg_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f16_e64 v1, -v1, -v1 clamp
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %fneg.a = fsub half -0.0, %a
  %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; f16 clamp of a negated fabs (negation written as `fsub half -0.0, ...`). The
; checks expect both operations as source modifiers (-|v|) on the clamping
; instruction.
define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_negabs_f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, -|v2| clamp
; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_negabs_f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_max_f16_e64 v2, -|v3|, -|v3| clamp
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_negabs_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1| clamp
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1| clamp
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %fabs.a = call half @llvm.fabs.f16(half %a)
  %fneg.fabs.a = fsub half -0.0, %fabs.a

  %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; f64 version of the 0.0/1.0 clamp. Every prefix expects one v_max_f64 of the
; value with itself carrying the clamp modifier.
define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_f64:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3] clamp
; GFX6-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_f64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1] clamp
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_f64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1] clamp
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1] clamp
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %max = call double @llvm.maxnum.f64(double %a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

; f64 clamp of a negated input (negation written as `fsub double -0.0, %a`).
; The checks expect the negation as a source modifier on the clamping v_max_f64.
define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_neg_f64:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3] clamp
; GFX6-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_neg_f64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_neg_f64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_neg_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %fneg.a = fsub double -0.0, %a
  %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

; f64 clamp of a negated fabs. The checks expect both operations as source
; modifiers (-|v|) on the clamping v_max_f64.
define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_negabs_f64:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_max_f64 v[2:3], -|v[2:3]|, -|v[2:3]| clamp
; GFX6-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_negabs_f64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_negabs_f64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %fabs.a = call double @llvm.fabs.f64(double %a)
  %fneg.fabs.a = fsub double -0.0, %fabs.a

  %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT:    s_brev_b32 s4, 1
; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_med3_f32 v2, s4, 1.0, v2
; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    s_brev_b32 s0, 1
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_med3_f32 v2, s0, 1.0, v3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_brev_b32 s2, 1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_med3_f32 v1, s2, 1.0, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_med3_f32 v1, 0x80000000, 1.0, v1
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, 
float %a) 928 store float %med, float addrspace(1)* %out.gep 929 ret void 930} 931 932define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { 933; GFX6-LABEL: v_clamp_med3_aby_f32: 934; GFX6: ; %bb.0: 935; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 936; GFX6-NEXT: s_mov_b32 s7, 0xf000 937; GFX6-NEXT: s_mov_b32 s6, 0 938; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 939; GFX6-NEXT: v_mov_b32_e32 v1, 0 940; GFX6-NEXT: s_waitcnt lgkmcnt(0) 941; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 942; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 943; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 944; GFX6-NEXT: s_waitcnt vmcnt(0) 945; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 946; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 947; GFX6-NEXT: s_endpgm 948; 949; GFX8-LABEL: v_clamp_med3_aby_f32: 950; GFX8: ; %bb.0: 951; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 952; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 953; GFX8-NEXT: s_waitcnt lgkmcnt(0) 954; GFX8-NEXT: v_mov_b32_e32 v1, s3 955; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 956; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 957; GFX8-NEXT: flat_load_dword v3, v[0:1] 958; GFX8-NEXT: v_mov_b32_e32 v1, s1 959; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 960; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 961; GFX8-NEXT: s_waitcnt vmcnt(0) 962; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 963; GFX8-NEXT: flat_store_dword v[0:1], v2 964; GFX8-NEXT: s_endpgm 965; 966; GFX9-LABEL: v_clamp_med3_aby_f32: 967; GFX9: ; %bb.0: 968; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 969; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 970; GFX9-NEXT: s_waitcnt lgkmcnt(0) 971; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 972; GFX9-NEXT: s_waitcnt vmcnt(0) 973; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 974; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 975; GFX9-NEXT: s_endpgm 976; 977; GFX11-LABEL: v_clamp_med3_aby_f32: 978; GFX11: ; %bb.0: 979; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 980; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 981; GFX11-NEXT: s_waitcnt lgkmcnt(0) 982; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 983; GFX11-NEXT: s_waitcnt vmcnt(0) 984; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 985; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 986; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 987; GFX11-NEXT: s_endpgm 988 %tid = call i32 @llvm.amdgcn.workitem.id.x() 989 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 990 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 991 %a = load float, float addrspace(1)* %gep0 992 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a) 993 store float %med, float addrspace(1)* %out.gep 994 ret void 995} 996 997define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { 998; GFX6-LABEL: v_clamp_med3_bay_f32: 999; GFX6: ; %bb.0: 1000; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1001; GFX6-NEXT: s_mov_b32 s7, 0xf000 1002; GFX6-NEXT: s_mov_b32 s6, 0 1003; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1004; GFX6-NEXT: v_mov_b32_e32 v1, 0 1005; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1006; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1007; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1008; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1009; GFX6-NEXT: s_waitcnt vmcnt(0) 1010; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1011; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1012; GFX6-NEXT: s_endpgm 1013; 1014; GFX8-LABEL: v_clamp_med3_bay_f32: 1015; GFX8: ; %bb.0: 1016; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1017; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1018; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1019; GFX8-NEXT: v_mov_b32_e32 v1, s3 1020; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1021; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1022; GFX8-NEXT: flat_load_dword v3, v[0:1] 1023; GFX8-NEXT: v_mov_b32_e32 v1, s1 1024; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1025; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1026; GFX8-NEXT: s_waitcnt 
vmcnt(0) 1027; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 1028; GFX8-NEXT: flat_store_dword v[0:1], v2 1029; GFX8-NEXT: s_endpgm 1030; 1031; GFX9-LABEL: v_clamp_med3_bay_f32: 1032; GFX9: ; %bb.0: 1033; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1034; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1035; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1036; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1037; GFX9-NEXT: s_waitcnt vmcnt(0) 1038; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1039; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1040; GFX9-NEXT: s_endpgm 1041; 1042; GFX11-LABEL: v_clamp_med3_bay_f32: 1043; GFX11: ; %bb.0: 1044; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1045; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1046; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1047; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1048; GFX11-NEXT: s_waitcnt vmcnt(0) 1049; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1050; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1051; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1052; GFX11-NEXT: s_endpgm 1053 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1054 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 1055 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 1056 %a = load float, float addrspace(1)* %gep0 1057 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a) 1058 store float %med, float addrspace(1)* %out.gep 1059 ret void 1060} 1061 1062define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { 1063; GFX6-LABEL: v_clamp_med3_yab_f32: 1064; GFX6: ; %bb.0: 1065; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1066; GFX6-NEXT: s_mov_b32 s7, 0xf000 1067; GFX6-NEXT: s_mov_b32 s6, 0 1068; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1069; GFX6-NEXT: v_mov_b32_e32 v1, 0 1070; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1071; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1072; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1073; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1074; GFX6-NEXT: s_waitcnt 
vmcnt(0) 1075; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1076; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1077; GFX6-NEXT: s_endpgm 1078; 1079; GFX8-LABEL: v_clamp_med3_yab_f32: 1080; GFX8: ; %bb.0: 1081; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1082; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1083; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1084; GFX8-NEXT: v_mov_b32_e32 v1, s3 1085; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1086; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1087; GFX8-NEXT: flat_load_dword v3, v[0:1] 1088; GFX8-NEXT: v_mov_b32_e32 v1, s1 1089; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1090; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1091; GFX8-NEXT: s_waitcnt vmcnt(0) 1092; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 1093; GFX8-NEXT: flat_store_dword v[0:1], v2 1094; GFX8-NEXT: s_endpgm 1095; 1096; GFX9-LABEL: v_clamp_med3_yab_f32: 1097; GFX9: ; %bb.0: 1098; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1099; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1100; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1101; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1102; GFX9-NEXT: s_waitcnt vmcnt(0) 1103; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1104; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1105; GFX9-NEXT: s_endpgm 1106; 1107; GFX11-LABEL: v_clamp_med3_yab_f32: 1108; GFX11: ; %bb.0: 1109; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1110; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1111; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1112; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1113; GFX11-NEXT: s_waitcnt vmcnt(0) 1114; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1115; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1116; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1117; GFX11-NEXT: s_endpgm 1118 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1119 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 1120 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 1121 %a = load float, float addrspace(1)* %gep0 1122 %med = call float 
@llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0) 1123 store float %med, float addrspace(1)* %out.gep 1124 ret void 1125} 1126 1127define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { 1128; GFX6-LABEL: v_clamp_med3_yba_f32: 1129; GFX6: ; %bb.0: 1130; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1131; GFX6-NEXT: s_mov_b32 s7, 0xf000 1132; GFX6-NEXT: s_mov_b32 s6, 0 1133; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1134; GFX6-NEXT: v_mov_b32_e32 v1, 0 1135; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1136; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1137; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1138; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1139; GFX6-NEXT: s_waitcnt vmcnt(0) 1140; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1141; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1142; GFX6-NEXT: s_endpgm 1143; 1144; GFX8-LABEL: v_clamp_med3_yba_f32: 1145; GFX8: ; %bb.0: 1146; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1147; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1148; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1149; GFX8-NEXT: v_mov_b32_e32 v1, s3 1150; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1151; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1152; GFX8-NEXT: flat_load_dword v3, v[0:1] 1153; GFX8-NEXT: v_mov_b32_e32 v1, s1 1154; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1155; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1156; GFX8-NEXT: s_waitcnt vmcnt(0) 1157; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 1158; GFX8-NEXT: flat_store_dword v[0:1], v2 1159; GFX8-NEXT: s_endpgm 1160; 1161; GFX9-LABEL: v_clamp_med3_yba_f32: 1162; GFX9: ; %bb.0: 1163; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1164; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1165; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1166; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1167; GFX9-NEXT: s_waitcnt vmcnt(0) 1168; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1169; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1170; GFX9-NEXT: s_endpgm 1171; 1172; GFX11-LABEL: 
v_clamp_med3_yba_f32: 1173; GFX11: ; %bb.0: 1174; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1175; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1176; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1177; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1178; GFX11-NEXT: s_waitcnt vmcnt(0) 1179; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1180; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1181; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1182; GFX11-NEXT: s_endpgm 1183 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1184 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 1185 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 1186 %a = load float, float addrspace(1)* %gep0 1187 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0) 1188 store float %med, float addrspace(1)* %out.gep 1189 ret void 1190} 1191 1192define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { 1193; GFX6-LABEL: v_clamp_med3_ayb_f32: 1194; GFX6: ; %bb.0: 1195; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1196; GFX6-NEXT: s_mov_b32 s7, 0xf000 1197; GFX6-NEXT: s_mov_b32 s6, 0 1198; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1199; GFX6-NEXT: v_mov_b32_e32 v1, 0 1200; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1201; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1202; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1203; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1204; GFX6-NEXT: s_waitcnt vmcnt(0) 1205; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1206; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1207; GFX6-NEXT: s_endpgm 1208; 1209; GFX8-LABEL: v_clamp_med3_ayb_f32: 1210; GFX8: ; %bb.0: 1211; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1212; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1213; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1214; GFX8-NEXT: v_mov_b32_e32 v1, s3 1215; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1216; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1217; GFX8-NEXT: flat_load_dword v3, v[0:1] 1218; GFX8-NEXT: v_mov_b32_e32 v1, 
s1 1219; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1220; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1221; GFX8-NEXT: s_waitcnt vmcnt(0) 1222; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 1223; GFX8-NEXT: flat_store_dword v[0:1], v2 1224; GFX8-NEXT: s_endpgm 1225; 1226; GFX9-LABEL: v_clamp_med3_ayb_f32: 1227; GFX9: ; %bb.0: 1228; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1229; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1230; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1231; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1232; GFX9-NEXT: s_waitcnt vmcnt(0) 1233; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1234; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1235; GFX9-NEXT: s_endpgm 1236; 1237; GFX11-LABEL: v_clamp_med3_ayb_f32: 1238; GFX11: ; %bb.0: 1239; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1240; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1241; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1242; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1243; GFX11-NEXT: s_waitcnt vmcnt(0) 1244; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1245; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1246; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1247; GFX11-NEXT: s_endpgm 1248 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1249 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 1250 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 1251 %a = load float, float addrspace(1)* %gep0 1252 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0) 1253 store float %med, float addrspace(1)* %out.gep 1254 ret void 1255} 1256 1257define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { 1258; GFX6-LABEL: v_clamp_med3_bya_f32: 1259; GFX6: ; %bb.0: 1260; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1261; GFX6-NEXT: s_mov_b32 s7, 0xf000 1262; GFX6-NEXT: s_mov_b32 s6, 0 1263; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1264; GFX6-NEXT: v_mov_b32_e32 v1, 0 1265; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1266; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1267; 
GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1268; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1269; GFX6-NEXT: s_waitcnt vmcnt(0) 1270; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1271; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1272; GFX6-NEXT: s_endpgm 1273; 1274; GFX8-LABEL: v_clamp_med3_bya_f32: 1275; GFX8: ; %bb.0: 1276; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1277; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1278; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1279; GFX8-NEXT: v_mov_b32_e32 v1, s3 1280; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1281; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1282; GFX8-NEXT: flat_load_dword v3, v[0:1] 1283; GFX8-NEXT: v_mov_b32_e32 v1, s1 1284; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1285; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1286; GFX8-NEXT: s_waitcnt vmcnt(0) 1287; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 1288; GFX8-NEXT: flat_store_dword v[0:1], v2 1289; GFX8-NEXT: s_endpgm 1290; 1291; GFX9-LABEL: v_clamp_med3_bya_f32: 1292; GFX9: ; %bb.0: 1293; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1294; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1295; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1296; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1297; GFX9-NEXT: s_waitcnt vmcnt(0) 1298; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1299; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1300; GFX9-NEXT: s_endpgm 1301; 1302; GFX11-LABEL: v_clamp_med3_bya_f32: 1303; GFX11: ; %bb.0: 1304; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1305; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1306; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1307; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1308; GFX11-NEXT: s_waitcnt vmcnt(0) 1309; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1310; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1311; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1312; GFX11-NEXT: s_endpgm 1313 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1314 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 1315 %out.gep = getelementptr float, float 
addrspace(1)* %out, i32 %tid 1316 %a = load float, float addrspace(1)* %gep0 1317 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0) 1318 store float %med, float addrspace(1)* %out.gep 1319 ret void 1320} 1321 1322define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 { 1323; GFX6-LABEL: v_clamp_constants_to_one_f32: 1324; GFX6: ; %bb.0: 1325; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1326; GFX6-NEXT: s_mov_b32 s3, 0xf000 1327; GFX6-NEXT: s_mov_b32 s2, 0 1328; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1329; GFX6-NEXT: v_mov_b32_e32 v1, 0 1330; GFX6-NEXT: v_mov_b32_e32 v2, 1.0 1331; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1332; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1333; GFX6-NEXT: s_endpgm 1334; 1335; GFX8-LABEL: v_clamp_constants_to_one_f32: 1336; GFX8: ; %bb.0: 1337; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1338; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1339; GFX8-NEXT: v_mov_b32_e32 v2, 1.0 1340; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1341; GFX8-NEXT: v_mov_b32_e32 v1, s1 1342; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1343; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1344; GFX8-NEXT: flat_store_dword v[0:1], v2 1345; GFX8-NEXT: s_endpgm 1346; 1347; GFX9-LABEL: v_clamp_constants_to_one_f32: 1348; GFX9: ; %bb.0: 1349; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1350; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1351; GFX9-NEXT: v_mov_b32_e32 v1, 1.0 1352; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1353; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1354; GFX9-NEXT: s_endpgm 1355; 1356; GFX11-LABEL: v_clamp_constants_to_one_f32: 1357; GFX11: ; %bb.0: 1358; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1359; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0 1360; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1361; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1362; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1363; GFX11-NEXT: s_endpgm 1364 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1365 %out.gep = 
getelementptr float, float addrspace(1)* %out, i32 %tid 1366 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0) 1367 store float %med, float addrspace(1)* %out.gep 1368 ret void 1369} 1370 1371define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 { 1372; GFX6-LABEL: v_clamp_constants_to_zero_f32: 1373; GFX6: ; %bb.0: 1374; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1375; GFX6-NEXT: s_mov_b32 s3, 0xf000 1376; GFX6-NEXT: s_mov_b32 s2, 0 1377; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1378; GFX6-NEXT: v_mov_b32_e32 v1, 0 1379; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1380; GFX6-NEXT: buffer_store_dword v1, v[0:1], s[0:3], 0 addr64 1381; GFX6-NEXT: s_endpgm 1382; 1383; GFX8-LABEL: v_clamp_constants_to_zero_f32: 1384; GFX8: ; %bb.0: 1385; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1386; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1387; GFX8-NEXT: v_mov_b32_e32 v2, 0 1388; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1389; GFX8-NEXT: v_mov_b32_e32 v1, s1 1390; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1391; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1392; GFX8-NEXT: flat_store_dword v[0:1], v2 1393; GFX8-NEXT: s_endpgm 1394; 1395; GFX9-LABEL: v_clamp_constants_to_zero_f32: 1396; GFX9: ; %bb.0: 1397; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1398; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1399; GFX9-NEXT: v_mov_b32_e32 v1, 0 1400; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1401; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1402; GFX9-NEXT: s_endpgm 1403; 1404; GFX11-LABEL: v_clamp_constants_to_zero_f32: 1405; GFX11: ; %bb.0: 1406; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1407; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 1408; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1409; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1410; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1411; GFX11-NEXT: s_endpgm 1412 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1413 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 1414 %med = call 
float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0) 1415 store float %med, float addrspace(1)* %out.gep 1416 ret void 1417} 1418 1419define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 { 1420; GFX6-LABEL: v_clamp_constant_preserve_f32: 1421; GFX6: ; %bb.0: 1422; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1423; GFX6-NEXT: s_mov_b32 s3, 0xf000 1424; GFX6-NEXT: s_mov_b32 s2, 0 1425; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1426; GFX6-NEXT: v_mov_b32_e32 v1, 0 1427; GFX6-NEXT: v_mov_b32_e32 v2, 0.5 1428; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1429; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1430; GFX6-NEXT: s_endpgm 1431; 1432; GFX8-LABEL: v_clamp_constant_preserve_f32: 1433; GFX8: ; %bb.0: 1434; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1435; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1436; GFX8-NEXT: v_mov_b32_e32 v2, 0.5 1437; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1438; GFX8-NEXT: v_mov_b32_e32 v1, s1 1439; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1440; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1441; GFX8-NEXT: flat_store_dword v[0:1], v2 1442; GFX8-NEXT: s_endpgm 1443; 1444; GFX9-LABEL: v_clamp_constant_preserve_f32: 1445; GFX9: ; %bb.0: 1446; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1447; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1448; GFX9-NEXT: v_mov_b32_e32 v1, 0.5 1449; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1450; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1451; GFX9-NEXT: s_endpgm 1452; 1453; GFX11-LABEL: v_clamp_constant_preserve_f32: 1454; GFX11: ; %bb.0: 1455; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1456; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0 1457; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1458; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1459; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1460; GFX11-NEXT: s_endpgm 1461 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1462 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 1463 %med = call float 
@llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5) 1464 store float %med, float addrspace(1)* %out.gep 1465 ret void 1466} 1467 1468define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 { 1469; GFX6-LABEL: v_clamp_constant_preserve_denorm_f32: 1470; GFX6: ; %bb.0: 1471; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1472; GFX6-NEXT: s_mov_b32 s3, 0xf000 1473; GFX6-NEXT: s_mov_b32 s2, 0 1474; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1475; GFX6-NEXT: v_mov_b32_e32 v1, 0 1476; GFX6-NEXT: v_mov_b32_e32 v2, 0x7fffff 1477; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1478; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1479; GFX6-NEXT: s_endpgm 1480; 1481; GFX8-LABEL: v_clamp_constant_preserve_denorm_f32: 1482; GFX8: ; %bb.0: 1483; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1484; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1485; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fffff 1486; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1487; GFX8-NEXT: v_mov_b32_e32 v1, s1 1488; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1489; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1490; GFX8-NEXT: flat_store_dword v[0:1], v2 1491; GFX8-NEXT: s_endpgm 1492; 1493; GFX9-LABEL: v_clamp_constant_preserve_denorm_f32: 1494; GFX9: ; %bb.0: 1495; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1496; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1497; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff 1498; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1499; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1500; GFX9-NEXT: s_endpgm 1501; 1502; GFX11-LABEL: v_clamp_constant_preserve_denorm_f32: 1503; GFX11: ; %bb.0: 1504; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1505; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0 1506; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1507; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1508; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1509; GFX11-NEXT: s_endpgm 1510 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1511 %out.gep = getelementptr float, float addrspace(1)* %out, i32 
%tid 1512 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float)) 1513 store float %med, float addrspace(1)* %out.gep 1514 ret void 1515} 1516 1517define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 { 1518; GFX6-LABEL: v_clamp_constant_qnan_f32: 1519; GFX6: ; %bb.0: 1520; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1521; GFX6-NEXT: s_mov_b32 s3, 0xf000 1522; GFX6-NEXT: s_mov_b32 s2, 0 1523; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1524; GFX6-NEXT: v_mov_b32_e32 v1, 0 1525; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1526; GFX6-NEXT: buffer_store_dword v1, v[0:1], s[0:3], 0 addr64 1527; GFX6-NEXT: s_endpgm 1528; 1529; GFX8-LABEL: v_clamp_constant_qnan_f32: 1530; GFX8: ; %bb.0: 1531; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1532; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1533; GFX8-NEXT: v_mov_b32_e32 v2, 0 1534; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1535; GFX8-NEXT: v_mov_b32_e32 v1, s1 1536; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1537; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1538; GFX8-NEXT: flat_store_dword v[0:1], v2 1539; GFX8-NEXT: s_endpgm 1540; 1541; GFX9-LABEL: v_clamp_constant_qnan_f32: 1542; GFX9: ; %bb.0: 1543; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1544; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1545; GFX9-NEXT: v_mov_b32_e32 v1, 0 1546; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1547; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1548; GFX9-NEXT: s_endpgm 1549; 1550; GFX11-LABEL: v_clamp_constant_qnan_f32: 1551; GFX11: ; %bb.0: 1552; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1553; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 1554; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1555; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1556; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1557; GFX11-NEXT: s_endpgm 1558 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1559 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 1560 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, 
float 1.0, float 0x7FF8000000000000) 1561 store float %med, float addrspace(1)* %out.gep 1562 ret void 1563} 1564 1565define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 { 1566; GFX6-LABEL: v_clamp_constant_snan_f32: 1567; GFX6: ; %bb.0: 1568; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1569; GFX6-NEXT: s_mov_b32 s3, 0xf000 1570; GFX6-NEXT: s_mov_b32 s2, 0 1571; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1572; GFX6-NEXT: v_mov_b32_e32 v1, 0 1573; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1574; GFX6-NEXT: buffer_store_dword v1, v[0:1], s[0:3], 0 addr64 1575; GFX6-NEXT: s_endpgm 1576; 1577; GFX8-LABEL: v_clamp_constant_snan_f32: 1578; GFX8: ; %bb.0: 1579; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1580; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1581; GFX8-NEXT: v_mov_b32_e32 v2, 0 1582; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1583; GFX8-NEXT: v_mov_b32_e32 v1, s1 1584; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1585; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1586; GFX8-NEXT: flat_store_dword v[0:1], v2 1587; GFX8-NEXT: s_endpgm 1588; 1589; GFX9-LABEL: v_clamp_constant_snan_f32: 1590; GFX9: ; %bb.0: 1591; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1592; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1593; GFX9-NEXT: v_mov_b32_e32 v1, 0 1594; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1595; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1596; GFX9-NEXT: s_endpgm 1597; 1598; GFX11-LABEL: v_clamp_constant_snan_f32: 1599; GFX11: ; %bb.0: 1600; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1601; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 1602; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1603; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1604; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1605; GFX11-NEXT: s_endpgm 1606 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1607 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 1608 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float)) 1609 store float %med, 
float addrspace(1)* %out.gep 1610 ret void 1611} 1612 1613; --------------------------------------------------------------------- 1614; Test non-default behaviors enabling snans and disabling dx10_clamp 1615; --------------------------------------------------------------------- 1616 1617define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { 1618; GFX6-LABEL: v_clamp_f32_no_dx10_clamp: 1619; GFX6: ; %bb.0: 1620; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1621; GFX6-NEXT: s_mov_b32 s7, 0xf000 1622; GFX6-NEXT: s_mov_b32 s6, 0 1623; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1624; GFX6-NEXT: v_mov_b32_e32 v1, 0 1625; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1626; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1627; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1628; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1629; GFX6-NEXT: s_waitcnt vmcnt(0) 1630; GFX6-NEXT: v_add_f32_e32 v2, 0.5, v2 1631; GFX6-NEXT: v_med3_f32 v2, v2, 0, 1.0 1632; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1633; GFX6-NEXT: s_endpgm 1634; 1635; GFX8-LABEL: v_clamp_f32_no_dx10_clamp: 1636; GFX8: ; %bb.0: 1637; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1638; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1639; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1640; GFX8-NEXT: v_mov_b32_e32 v1, s3 1641; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1642; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1643; GFX8-NEXT: flat_load_dword v3, v[0:1] 1644; GFX8-NEXT: v_mov_b32_e32 v1, s1 1645; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1646; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1647; GFX8-NEXT: s_waitcnt vmcnt(0) 1648; GFX8-NEXT: v_add_f32_e32 v2, 0.5, v3 1649; GFX8-NEXT: v_med3_f32 v2, v2, 0, 1.0 1650; GFX8-NEXT: flat_store_dword v[0:1], v2 1651; GFX8-NEXT: s_endpgm 1652; 1653; GFX9-LABEL: v_clamp_f32_no_dx10_clamp: 1654; GFX9: ; %bb.0: 1655; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1656; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1657; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
1658; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1659; GFX9-NEXT: s_waitcnt vmcnt(0) 1660; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1 1661; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 1662; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1663; GFX9-NEXT: s_endpgm 1664; 1665; GFX11-LABEL: v_clamp_f32_no_dx10_clamp: 1666; GFX11: ; %bb.0: 1667; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1668; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1669; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1670; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1671; GFX11-NEXT: s_waitcnt vmcnt(0) 1672; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1 1673; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1674; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 1675; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1676; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1677; GFX11-NEXT: s_endpgm 1678 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1679 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 1680 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 1681 %a = load float, float addrspace(1)* %gep0 1682 %a.nnan = fadd nnan float %a, 0.5 1683 %max = call float @llvm.maxnum.f32(float %a.nnan, float 0.0) 1684 %med = call float @llvm.minnum.f32(float %max, float 1.0) 1685 1686 store float %med, float addrspace(1)* %out.gep 1687 ret void 1688} 1689 1690define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 { 1691; GFX6-LABEL: v_clamp_f32_snan_dx10clamp: 1692; GFX6: ; %bb.0: 1693; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1694; GFX6-NEXT: s_mov_b32 s7, 0xf000 1695; GFX6-NEXT: s_mov_b32 s6, 0 1696; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1697; GFX6-NEXT: v_mov_b32_e32 v1, 0 1698; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1699; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1700; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1701; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1702; GFX6-NEXT: s_waitcnt vmcnt(0) 1703; GFX6-NEXT: v_add_f32_e64 v2, v2, 0.5 clamp 1704; GFX6-NEXT: buffer_store_dword 
v2, v[0:1], s[0:3], 0 addr64 1705; GFX6-NEXT: s_endpgm 1706; 1707; GFX8-LABEL: v_clamp_f32_snan_dx10clamp: 1708; GFX8: ; %bb.0: 1709; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1710; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1711; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1712; GFX8-NEXT: v_mov_b32_e32 v1, s3 1713; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1714; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1715; GFX8-NEXT: flat_load_dword v3, v[0:1] 1716; GFX8-NEXT: v_mov_b32_e32 v1, s1 1717; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1718; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1719; GFX8-NEXT: s_waitcnt vmcnt(0) 1720; GFX8-NEXT: v_add_f32_e64 v2, v3, 0.5 clamp 1721; GFX8-NEXT: flat_store_dword v[0:1], v2 1722; GFX8-NEXT: s_endpgm 1723; 1724; GFX9-LABEL: v_clamp_f32_snan_dx10clamp: 1725; GFX9: ; %bb.0: 1726; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1727; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1728; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1729; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1730; GFX9-NEXT: s_waitcnt vmcnt(0) 1731; GFX9-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp 1732; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1733; GFX9-NEXT: s_endpgm 1734; 1735; GFX11-LABEL: v_clamp_f32_snan_dx10clamp: 1736; GFX11: ; %bb.0: 1737; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1738; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1739; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1740; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1741; GFX11-NEXT: s_waitcnt vmcnt(0) 1742; GFX11-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp 1743; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1744; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1745; GFX11-NEXT: s_endpgm 1746 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1747 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 1748 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 1749 %a = load float, float addrspace(1)* %gep0 1750 %add = fadd float %a, 0.5 1751 %max = call float @llvm.maxnum.f32(float %add, float 0.0) 1752 %med = call float 
@llvm.minnum.f32(float %max, float 1.0) 1753 1754 store float %med, float addrspace(1)* %out.gep 1755 ret void 1756} 1757 1758define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { 1759; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp: 1760; GFX6: ; %bb.0: 1761; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1762; GFX6-NEXT: s_mov_b32 s7, 0xf000 1763; GFX6-NEXT: s_mov_b32 s6, 0 1764; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1765; GFX6-NEXT: v_mov_b32_e32 v1, 0 1766; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1767; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1768; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1769; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1770; GFX6-NEXT: s_waitcnt vmcnt(0) 1771; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 1772; GFX6-NEXT: v_med3_f32 v2, v2, 0, 1.0 1773; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1774; GFX6-NEXT: s_endpgm 1775; 1776; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp: 1777; GFX8: ; %bb.0: 1778; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1779; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1780; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1781; GFX8-NEXT: v_mov_b32_e32 v1, s3 1782; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1783; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1784; GFX8-NEXT: flat_load_dword v3, v[0:1] 1785; GFX8-NEXT: v_mov_b32_e32 v1, s1 1786; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1787; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1788; GFX8-NEXT: s_waitcnt vmcnt(0) 1789; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 1790; GFX8-NEXT: v_med3_f32 v2, v2, 0, 1.0 1791; GFX8-NEXT: flat_store_dword v[0:1], v2 1792; GFX8-NEXT: s_endpgm 1793; 1794; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp: 1795; GFX9: ; %bb.0: 1796; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1797; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1798; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1799; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1800; GFX9-NEXT: s_waitcnt vmcnt(0) 1801; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 1802; GFX9-NEXT: 
v_med3_f32 v1, v1, 0, 1.0 1803; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1804; GFX9-NEXT: s_endpgm 1805; 1806; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp: 1807; GFX11: ; %bb.0: 1808; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1809; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1810; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1811; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1812; GFX11-NEXT: s_waitcnt vmcnt(0) 1813; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 1814; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1815; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 1816; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1817; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1818; GFX11-NEXT: s_endpgm 1819 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1820 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 1821 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 1822 %a = load float, float addrspace(1)* %gep0 1823 %max = call float @llvm.maxnum.f32(float %a, float 0.0) 1824 %med = call float @llvm.minnum.f32(float %max, float 1.0) 1825 1826 store float %med, float addrspace(1)* %out.gep 1827 ret void 1828} 1829 1830define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { 1831; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: 1832; GFX6: ; %bb.0: 1833; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1834; GFX6-NEXT: s_mov_b32 s7, 0xf000 1835; GFX6-NEXT: s_mov_b32 s6, 0 1836; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1837; GFX6-NEXT: v_mov_b32_e32 v1, 0 1838; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1839; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1840; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1841; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1842; GFX6-NEXT: s_waitcnt vmcnt(0) 1843; GFX6-NEXT: v_add_f32_e32 v2, 1.0, v2 1844; GFX6-NEXT: v_med3_f32 v2, v2, 0, 1.0 1845; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1846; GFX6-NEXT: s_endpgm 1847; 1848; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: 1849; 
GFX8: ; %bb.0: 1850; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1851; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1852; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1853; GFX8-NEXT: v_mov_b32_e32 v1, s3 1854; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1855; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1856; GFX8-NEXT: flat_load_dword v3, v[0:1] 1857; GFX8-NEXT: v_mov_b32_e32 v1, s1 1858; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1859; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1860; GFX8-NEXT: s_waitcnt vmcnt(0) 1861; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3 1862; GFX8-NEXT: v_med3_f32 v2, v2, 0, 1.0 1863; GFX8-NEXT: flat_store_dword v[0:1], v2 1864; GFX8-NEXT: s_endpgm 1865; 1866; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: 1867; GFX9: ; %bb.0: 1868; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1869; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1870; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1871; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1872; GFX9-NEXT: s_waitcnt vmcnt(0) 1873; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 1874; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 1875; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1876; GFX9-NEXT: s_endpgm 1877; 1878; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: 1879; GFX11: ; %bb.0: 1880; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1881; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1882; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1883; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1884; GFX11-NEXT: s_waitcnt vmcnt(0) 1885; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 1886; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1887; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 1888; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1889; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1890; GFX11-NEXT: s_endpgm 1891 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1892 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 1893 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 1894 %a = load float, float addrspace(1)* %gep0 1895 %add = fadd nnan float %a, 1.0 1896 %max = 
call float @llvm.maxnum.f32(float %add, float 0.0) 1897 %med = call float @llvm.minnum.f32(float %max, float 1.0) 1898 1899 store float %med, float addrspace(1)* %out.gep 1900 ret void 1901} 1902 1903define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { 1904; GFX6-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: 1905; GFX6: ; %bb.0: 1906; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1907; GFX6-NEXT: s_mov_b32 s7, 0xf000 1908; GFX6-NEXT: s_mov_b32 s6, 0 1909; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1910; GFX6-NEXT: v_mov_b32_e32 v1, 0 1911; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1912; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1913; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1914; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1915; GFX6-NEXT: s_waitcnt vmcnt(0) 1916; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1917; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1918; GFX6-NEXT: s_endpgm 1919; 1920; GFX8-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: 1921; GFX8: ; %bb.0: 1922; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1923; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1924; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1925; GFX8-NEXT: v_mov_b32_e32 v1, s3 1926; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1927; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1928; GFX8-NEXT: flat_load_dword v3, v[0:1] 1929; GFX8-NEXT: v_mov_b32_e32 v1, s1 1930; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1931; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1932; GFX8-NEXT: s_waitcnt vmcnt(0) 1933; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 1934; GFX8-NEXT: flat_store_dword v[0:1], v2 1935; GFX8-NEXT: s_endpgm 1936; 1937; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: 1938; GFX9: ; %bb.0: 1939; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1940; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1941; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1942; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1943; GFX9-NEXT: s_waitcnt vmcnt(0) 1944; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 
clamp 1945; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1946; GFX9-NEXT: s_endpgm 1947; 1948; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: 1949; GFX11: ; %bb.0: 1950; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1951; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1952; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1953; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1954; GFX11-NEXT: s_waitcnt vmcnt(0) 1955; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 1956; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1957; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1958; GFX11-NEXT: s_endpgm 1959 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1960 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 1961 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 1962 %a = load float, float addrspace(1)* %gep0 1963 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a) 1964 store float %med, float addrspace(1)* %out.gep 1965 ret void 1966} 1967 1968define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { 1969; GFX6-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: 1970; GFX6: ; %bb.0: 1971; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1972; GFX6-NEXT: s_mov_b32 s7, 0xf000 1973; GFX6-NEXT: s_mov_b32 s6, 0 1974; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1975; GFX6-NEXT: v_mov_b32_e32 v1, 0 1976; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1977; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 1978; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1979; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 1980; GFX6-NEXT: s_waitcnt vmcnt(0) 1981; GFX6-NEXT: v_max_f32_e64 v2, v2, v2 clamp 1982; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1983; GFX6-NEXT: s_endpgm 1984; 1985; GFX8-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: 1986; GFX8: ; %bb.0: 1987; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1988; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1989; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1990; GFX8-NEXT: v_mov_b32_e32 v1, s3 1991; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1992; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1993; GFX8-NEXT: flat_load_dword v3, v[0:1] 1994; GFX8-NEXT: v_mov_b32_e32 v1, s1 1995; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1996; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1997; GFX8-NEXT: s_waitcnt vmcnt(0) 1998; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp 1999; GFX8-NEXT: flat_store_dword v[0:1], v2 2000; GFX8-NEXT: s_endpgm 2001; 2002; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: 2003; GFX9: ; %bb.0: 2004; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2005; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2006; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2007; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2008; GFX9-NEXT: s_waitcnt vmcnt(0) 2009; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp 2010; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2011; GFX9-NEXT: s_endpgm 2012; 2013; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: 2014; GFX11: ; %bb.0: 2015; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 2016; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2017; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2018; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2019; GFX11-NEXT: s_waitcnt vmcnt(0) 2020; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp 2021; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2022; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2023; GFX11-NEXT: s_endpgm 2024 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2025 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 2026 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 2027 %a = load float, float addrspace(1)* %gep0 2028 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a) 2029 store float %med, float addrspace(1)* %out.gep 2030 ret void 2031} 2032 2033define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { 2034; GFX6-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: 2035; GFX6: ; %bb.0: 2036; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2037; 
GFX6-NEXT: s_mov_b32 s7, 0xf000 2038; GFX6-NEXT: s_mov_b32 s6, 0 2039; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2040; GFX6-NEXT: v_mov_b32_e32 v1, 0 2041; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2042; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 2043; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2044; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2045; GFX6-NEXT: s_waitcnt vmcnt(0) 2046; GFX6-NEXT: v_med3_f32 v2, v2, 0, 1.0 2047; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2048; GFX6-NEXT: s_endpgm 2049; 2050; GFX8-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: 2051; GFX8: ; %bb.0: 2052; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2053; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2054; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2055; GFX8-NEXT: v_mov_b32_e32 v1, s3 2056; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2057; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2058; GFX8-NEXT: flat_load_dword v3, v[0:1] 2059; GFX8-NEXT: v_mov_b32_e32 v1, s1 2060; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2061; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2062; GFX8-NEXT: s_waitcnt vmcnt(0) 2063; GFX8-NEXT: v_med3_f32 v2, v3, 0, 1.0 2064; GFX8-NEXT: flat_store_dword v[0:1], v2 2065; GFX8-NEXT: s_endpgm 2066; 2067; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: 2068; GFX9: ; %bb.0: 2069; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2070; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2071; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2072; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2073; GFX9-NEXT: s_waitcnt vmcnt(0) 2074; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 2075; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2076; GFX9-NEXT: s_endpgm 2077; 2078; GFX11-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: 2079; GFX11: ; %bb.0: 2080; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 2081; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2082; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2083; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2084; GFX11-NEXT: s_waitcnt vmcnt(0) 2085; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 2086; GFX11-NEXT: global_store_b32 v0, 
v1, s[0:1] 2087; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2088; GFX11-NEXT: s_endpgm 2089 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2090 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 2091 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 2092 %a = load float, float addrspace(1)* %gep0 2093 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0) 2094 store float %med, float addrspace(1)* %out.gep 2095 ret void 2096} 2097 2098define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { 2099; GFX6-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: 2100; GFX6: ; %bb.0: 2101; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2102; GFX6-NEXT: s_mov_b32 s7, 0xf000 2103; GFX6-NEXT: s_mov_b32 s6, 0 2104; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2105; GFX6-NEXT: v_mov_b32_e32 v1, 0 2106; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2107; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 2108; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2109; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2110; GFX6-NEXT: s_waitcnt vmcnt(0) 2111; GFX6-NEXT: v_med3_f32 v2, v2, 1.0, 0 2112; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2113; GFX6-NEXT: s_endpgm 2114; 2115; GFX8-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: 2116; GFX8: ; %bb.0: 2117; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2118; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2119; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2120; GFX8-NEXT: v_mov_b32_e32 v1, s3 2121; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2122; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2123; GFX8-NEXT: flat_load_dword v3, v[0:1] 2124; GFX8-NEXT: v_mov_b32_e32 v1, s1 2125; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2126; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2127; GFX8-NEXT: s_waitcnt vmcnt(0) 2128; GFX8-NEXT: v_med3_f32 v2, v3, 1.0, 0 2129; GFX8-NEXT: flat_store_dword v[0:1], v2 2130; GFX8-NEXT: s_endpgm 2131; 2132; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: 2133; 
GFX9: ; %bb.0: 2134; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2135; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2136; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2137; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2138; GFX9-NEXT: s_waitcnt vmcnt(0) 2139; GFX9-NEXT: v_med3_f32 v1, v1, 1.0, 0 2140; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2141; GFX9-NEXT: s_endpgm 2142; 2143; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: 2144; GFX11: ; %bb.0: 2145; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 2146; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2147; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2148; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2149; GFX11-NEXT: s_waitcnt vmcnt(0) 2150; GFX11-NEXT: v_med3_f32 v1, v1, 1.0, 0 2151; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2152; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2153; GFX11-NEXT: s_endpgm 2154 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2155 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 2156 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 2157 %a = load float, float addrspace(1)* %gep0 2158 %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0) 2159 store float %med, float addrspace(1)* %out.gep 2160 ret void 2161} 2162 2163define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { 2164; GFX6-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: 2165; GFX6: ; %bb.0: 2166; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2167; GFX6-NEXT: s_mov_b32 s7, 0xf000 2168; GFX6-NEXT: s_mov_b32 s6, 0 2169; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2170; GFX6-NEXT: v_mov_b32_e32 v1, 0 2171; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2172; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 2173; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2174; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2175; GFX6-NEXT: s_waitcnt vmcnt(0) 2176; GFX6-NEXT: v_med3_f32 v2, 0, v2, 1.0 2177; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2178; GFX6-NEXT: s_endpgm 2179; 
2180; GFX8-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: 2181; GFX8: ; %bb.0: 2182; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2183; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2184; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2185; GFX8-NEXT: v_mov_b32_e32 v1, s3 2186; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2187; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2188; GFX8-NEXT: flat_load_dword v3, v[0:1] 2189; GFX8-NEXT: v_mov_b32_e32 v1, s1 2190; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2191; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2192; GFX8-NEXT: s_waitcnt vmcnt(0) 2193; GFX8-NEXT: v_med3_f32 v2, 0, v3, 1.0 2194; GFX8-NEXT: flat_store_dword v[0:1], v2 2195; GFX8-NEXT: s_endpgm 2196; 2197; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: 2198; GFX9: ; %bb.0: 2199; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2200; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2201; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2202; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2203; GFX9-NEXT: s_waitcnt vmcnt(0) 2204; GFX9-NEXT: v_med3_f32 v1, 0, v1, 1.0 2205; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2206; GFX9-NEXT: s_endpgm 2207; 2208; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: 2209; GFX11: ; %bb.0: 2210; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 2211; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2212; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2213; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2214; GFX11-NEXT: s_waitcnt vmcnt(0) 2215; GFX11-NEXT: v_med3_f32 v1, 0, v1, 1.0 2216; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2217; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2218; GFX11-NEXT: s_endpgm 2219 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2220 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 2221 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 2222 %a = load float, float addrspace(1)* %gep0 2223 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0) 2224 store float %med, float addrspace(1)* %out.gep 2225 ret void 2226} 2227 2228define 
amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { 2229; GFX6-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: 2230; GFX6: ; %bb.0: 2231; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2232; GFX6-NEXT: s_mov_b32 s7, 0xf000 2233; GFX6-NEXT: s_mov_b32 s6, 0 2234; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2235; GFX6-NEXT: v_mov_b32_e32 v1, 0 2236; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2237; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] 2238; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2239; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] 2240; GFX6-NEXT: s_waitcnt vmcnt(0) 2241; GFX6-NEXT: v_med3_f32 v2, 1.0, v2, 0 2242; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2243; GFX6-NEXT: s_endpgm 2244; 2245; GFX8-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: 2246; GFX8: ; %bb.0: 2247; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2248; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2249; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2250; GFX8-NEXT: v_mov_b32_e32 v1, s3 2251; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2252; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2253; GFX8-NEXT: flat_load_dword v3, v[0:1] 2254; GFX8-NEXT: v_mov_b32_e32 v1, s1 2255; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2256; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2257; GFX8-NEXT: s_waitcnt vmcnt(0) 2258; GFX8-NEXT: v_med3_f32 v2, 1.0, v3, 0 2259; GFX8-NEXT: flat_store_dword v[0:1], v2 2260; GFX8-NEXT: s_endpgm 2261; 2262; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: 2263; GFX9: ; %bb.0: 2264; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2265; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2266; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2267; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2268; GFX9-NEXT: s_waitcnt vmcnt(0) 2269; GFX9-NEXT: v_med3_f32 v1, 1.0, v1, 0 2270; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2271; GFX9-NEXT: s_endpgm 2272; 2273; GFX11-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: 2274; GFX11: ; %bb.0: 2275; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 2276; 
GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2277; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2278; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2279; GFX11-NEXT: s_waitcnt vmcnt(0) 2280; GFX11-NEXT: v_med3_f32 v1, 1.0, v1, 0 2281; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2282; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2283; GFX11-NEXT: s_endpgm 2284 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2285 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 2286 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 2287 %a = load float, float addrspace(1)* %gep0 2288 %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0) 2289 store float %med, float addrspace(1)* %out.gep 2290 ret void 2291} 2292 2293define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 { 2294; GFX6-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: 2295; GFX6: ; %bb.0: 2296; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2297; GFX6-NEXT: s_mov_b32 s3, 0xf000 2298; GFX6-NEXT: s_mov_b32 s2, 0 2299; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2300; GFX6-NEXT: v_mov_b32_e32 v1, 0 2301; GFX6-NEXT: v_mov_b32_e32 v2, 0x7fc00000 2302; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2303; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2304; GFX6-NEXT: s_endpgm 2305; 2306; GFX8-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: 2307; GFX8: ; %bb.0: 2308; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2309; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2310; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 2311; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2312; GFX8-NEXT: v_mov_b32_e32 v1, s1 2313; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 2314; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2315; GFX8-NEXT: flat_store_dword v[0:1], v2 2316; GFX8-NEXT: s_endpgm 2317; 2318; GFX9-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: 2319; GFX9: ; %bb.0: 2320; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2321; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2322; GFX9-NEXT: v_mov_b32_e32 v1, 
0x7fc00000 2323; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2324; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2325; GFX9-NEXT: s_endpgm 2326; 2327; GFX11-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: 2328; GFX11: ; %bb.0: 2329; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2330; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0 2331; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2332; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2333; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2334; GFX11-NEXT: s_endpgm 2335 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2336 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid 2337 %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000) 2338 store float %med, float addrspace(1)* %out.gep 2339 ret void 2340} 2341 2342define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 { 2343; GFX6-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: 2344; GFX6: ; %bb.0: 2345; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2346; GFX6-NEXT: s_mov_b32 s3, 0xf000 2347; GFX6-NEXT: s_mov_b32 s2, 0 2348; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2349; GFX6-NEXT: v_mov_b32_e32 v1, 0 2350; GFX6-NEXT: v_mov_b32_e32 v2, 0x7f800001 2351; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2352; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2353; GFX6-NEXT: s_endpgm 2354; 2355; GFX8-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: 2356; GFX8: ; %bb.0: 2357; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2358; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2359; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800001 2360; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2361; GFX8-NEXT: v_mov_b32_e32 v1, s1 2362; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 2363; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2364; GFX8-NEXT: flat_store_dword v[0:1], v2 2365; GFX8-NEXT: s_endpgm 2366; 2367; GFX9-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: 2368; GFX9: ; %bb.0: 2369; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2370; 
GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800001
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Packed-half clamp: minnum(maxnum(x, 0.0), 1.0) with the same bounds on both
; lanes. Expected to become one clamped v_pk_max_f16 on gfx900 and gfx1100,
; two clamped v_max_f16 (one of them SDWA) on fiji, and clamped f16-to-f32
; conversions per lane on tahiti.
define amdgpu_kernel void @v_clamp_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: v_clamp_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e64 v3, v3, v3 clamp
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %src.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %idx
  %dst.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %idx
  %x = load <2 x half>, <2 x half> addrspace(1)* %src.gep
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.gep
  ret void
}

; Clamp whose bound vectors each carry one undef lane: lower bound
; <undef, 0.0>, upper bound <1.0, undef>. gfx900 and gfx1100 are still expected
; to fold to a clamped v_pk_max_f16; tahiti and fiji keep explicit max/min in
; which the undef bounds appear as the constants 0x7fc00000 and 0x7e00.
; NOTE(review): the IR body is identical to v_clamp_v2f16_undef_limit_elts1
; further down in this file.
define amdgpu_kernel void @v_clamp_v2f16_undef_elt(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_v2f16_undef_elt:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b32 s2, 0x7fc00000
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_med3_f32 v2, v2, 0, s2
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: v_clamp_v2f16_undef_elt:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3
; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_v2f16_undef_elt:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_undef_elt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %src.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %idx
  %dst.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %idx
  %x = load <2 x half>, <2 x half> addrspace(1)* %src.gep
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> <half undef, half 0.0>)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half undef>)

  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.gep
  ret void
}

; Negative test: lane 0 uses a lower bound of 2.0 instead of 0.0, so the pair
; must not turn into a packed clamp. gfx900 and gfx1100 keep unclamped
; v_pk_max_f16 / v_pk_min_f16; on tahiti and fiji only the high lane (bounds
; 0.0 and 1.0) is expected to use the clamp modifier.
define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_v2f16_not_zero:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_max_f32_e32 v3, 2.0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: v_clamp_v2f16_not_zero:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
; GFX8-NEXT: v_max_f16_e32 v2, 2.0, v2
; GFX8-NEXT: v_max_f16_sdwa v3, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_min_f16_e32 v2, 1.0, v2
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_v2f16_not_zero:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, 2.0
; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_not_zero:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %src.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %idx
  %dst.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %idx
  %x = load <2 x half>, <2 x half> addrspace(1)* %src.gep
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> <half 2.0, half 0.0>)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.gep
  ret void
}

; Negative test: lane 0 uses an upper bound of 0.0 instead of 1.0. As with the
; not_zero variant, no packed clamp is expected on gfx900 and gfx1100, and only
; the high lane is clamped on tahiti and fiji.
define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_v2f16_not_one:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_med3_f32 v2, v2, 0, 0
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: v_clamp_v2f16_not_one:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
; GFX8-NEXT: v_max_f16_sdwa v3, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_min_f16_e32 v2, 0, v2
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_v2f16_not_one:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, 0
; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_not_one:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, 0
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %src.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %idx
  %dst.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %idx
  %x = load <2 x half>, <2 x half> addrspace(1)* %src.gep
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> <half 0.0, half 0.0>)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 0.0, half 1.0>)

  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.gep
  ret void
}

; Clamp of the negated input (fsub from -0.0 on both lanes). The negation is
; expected to fold into source modifiers: neg_lo/neg_hi on the packed max for
; gfx900 and gfx1100, negated operands on fiji. tahiti flips both sign bits
; with v_xor_b32 0x80008000 before the per-lane clamped conversions.
define amdgpu_kernel void @v_clamp_neg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_neg_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: v_clamp_neg_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e64 v3, -v3, -v3 clamp
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_neg_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neg_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %src.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %idx
  %dst.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %idx
  %x = load <2 x half>, <2 x half> addrspace(1)* %src.gep
  %neg.x = fsub <2 x half> <half -0.0, half -0.0>, %x
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.x, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.gep
  ret void
}

; Clamp of -|x| on both lanes. fiji folds this into -|v| source modifiers.
; gfx900 and gfx1100 clear the sign bits with v_and_b32 0x7fff7fff and negate
; through neg_lo/neg_hi on the clamped packed max. tahiti sets both sign bits
; with v_or_b32 0x80008000 before converting.
define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_negabs_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_or_b32_e32 v2, 0x80008000, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: v_clamp_negabs_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, -|v3|, -|v3| clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e64 v3, -|v3|, -|v3| clamp
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_negabs_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %src.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %idx
  %dst.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %idx
  %x = load <2 x half>, <2 x half> addrspace(1)* %src.gep
  %abs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
  %neg.abs.x = fsub <2 x half> <half -0.0, half -0.0>, %abs.x

  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.abs.x, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.gep
  ret void
}

; Only lane 0 is negated before the clamp. Expected to show up as neg_lo:[1,1]
; alone on gfx900 and gfx1100, and as a negated operand only on the low-lane
; instruction on tahiti and fiji.
define amdgpu_kernel void @v_clamp_neglo_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_neglo_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_cvt_f32_f16_e64 v2, -v2 clamp
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: v_clamp_neglo_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e64 v3, -v3, -v3 clamp
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_neglo_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neglo_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %src.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %idx
  %dst.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %idx
  %x = load <2 x half>, <2 x half> addrspace(1)* %src.gep
  %elt0 = extractelement <2 x half> %x, i32 0
  %neg.elt0 = fsub half -0.0, %elt0
  %x.neglo = insertelement <2 x half> %x, half %neg.elt0, i32 0
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x.neglo, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.gep
  ret void
}

; Only lane 1 is negated before the clamp. Mirror image of the neglo test:
; neg_hi:[1,1] alone on gfx900 and gfx1100, and a negated operand only on the
; high-lane instruction on tahiti and fiji.
define amdgpu_kernel void @v_clamp_neghi_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_neghi_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e64 v3, -v3 clamp
; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: v_clamp_neghi_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e64 v3, v3, v3 clamp
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_neghi_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neghi_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %src.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %idx
  %dst.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %idx
  %x = load <2 x half>, <2 x half> addrspace(1)* %src.gep
  %elt1 = extractelement <2 x half> %x, i32 1
  %neg.elt1 = fsub half -0.0, %elt1
  %x.neghi = insertelement <2 x half> %x, half %neg.elt1, i32 1
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x.neghi, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.gep
  ret void
}

; Clamp applied to the lane-swapped input (shufflevector mask <1, 0>).
; gfx900 and gfx1100 are expected to express the swap with
; op_sel:[1,1] op_sel_hi:[0,0] on the clamped packed max; tahiti and fiji swap
; by choosing which half of the result each per-lane value is written to.
define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_v2f16_shuffle:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: v_clamp_v2f16_shuffle:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_max_f16_sdwa v3, v3, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_v2f16_shuffle:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_shuffle:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %src.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %idx
  %dst.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %idx
  %x = load <2 x half>, <2 x half> addrspace(1)* %src.gep
  %swapped = shufflevector <2 x half> %x, <2 x half> undef, <2 x i32> <i32 1, i32 0>
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %swapped, <2 x half> zeroinitializer)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.gep
  ret void
}

; Bound vectors <0.0, undef> (lower) and <undef, 1.0> (upper): each lane has
; one defined bound and one undef bound. gfx900 and gfx1100 are expected to
; fold to a clamped v_pk_max_f16; tahiti and fiji keep explicit max/min/med3
; with the constants 0x7fc00000 and 0x7e00 standing in for the undef bounds.
define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b32 s2, 0x7fc00000
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_med3_f32 v3, v3, s2, 1.0
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_med3_f32 v2, v2, 0, s2
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v2, 0x7e00, v2
; GFX8-NEXT: v_max_f16_e32 v3, 0, v3
; GFX8-NEXT: v_min_f16_e32 v3, 0x7e00, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %src.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %idx
  %dst.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %idx
  %x = load <2 x half>, <2 x half> addrspace(1)* %src.gep
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> <half 0.0, half undef>)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half undef, half 1.0>)

  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.gep
  ret void
}

; Bound vectors <undef, 0.0> (lower) and <1.0, undef> (upper), i.e. the undef
; lanes are swapped relative to the elts0 variant. Expectations follow the same
; split: clamped v_pk_max_f16 on gfx900 and gfx1100, explicit max/min/med3 on
; tahiti and fiji.
; NOTE(review): the IR body is identical to v_clamp_v2f16_undef_elt earlier in
; this file.
define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b32 s2, 0x7fc00000
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_med3_f32 v2, v2, 0, s2
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3
; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %src.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %idx
  %dst.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %idx
  %x = load <2 x half>, <2 x half> addrspace(1)* %src.gep
  %lower = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> <half undef, half 0.0>)
  %clamped = call <2 x half> @llvm.minnum.v2f16(<2 x half> %lower, <2 x half> <half 1.0, half undef>)

  store <2 x half> %clamped, <2 x half> addrspace(1)* %dst.gep
  ret void
}

define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0
; GFX6-LABEL: v_clamp_diff_source_f32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX6-NEXT: s_load_dword s2, s[2:3], 0x2
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s5
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: v_add_f32_e32 v0, s4, v0
; GFX6-NEXT: v_add_f32_e32 v1, s4, v1
; GFX6-NEXT: v_max_f32_e64 v0, v0, v1 clamp
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: v_clamp_diff_source_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX8-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX8-NEXT: s_add_u32 s0, s0, 12
; GFX8-NEXT: s_addc_u32 s1, s1, 0
3310; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3311; GFX8-NEXT: v_mov_b32_e32 v0, s5 3312; GFX8-NEXT: v_mov_b32_e32 v1, s2 3313; GFX8-NEXT: v_add_f32_e32 v0, s4, v0 3314; GFX8-NEXT: v_add_f32_e32 v1, s4, v1 3315; GFX8-NEXT: v_max_f32_e64 v2, v0, v1 clamp 3316; GFX8-NEXT: v_mov_b32_e32 v0, s0 3317; GFX8-NEXT: v_mov_b32_e32 v1, s1 3318; GFX8-NEXT: flat_store_dword v[0:1], v2 3319; GFX8-NEXT: s_endpgm 3320; 3321; GFX9-LABEL: v_clamp_diff_source_f32: 3322; GFX9: ; %bb.0: 3323; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3324; GFX9-NEXT: v_mov_b32_e32 v0, 0 3325; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3326; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 3327; GFX9-NEXT: s_load_dword s6, s[2:3], 0x8 3328; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3329; GFX9-NEXT: v_mov_b32_e32 v1, s5 3330; GFX9-NEXT: v_mov_b32_e32 v2, s6 3331; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 3332; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 3333; GFX9-NEXT: v_max_f32_e64 v1, v1, v2 clamp 3334; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:12 3335; GFX9-NEXT: s_endpgm 3336; 3337; GFX11-LABEL: v_clamp_diff_source_f32: 3338; GFX11: ; %bb.0: 3339; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 3340; GFX11-NEXT: v_mov_b32_e32 v2, 0 3341; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3342; GFX11-NEXT: s_clause 0x1 3343; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 3344; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x8 3345; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3346; GFX11-NEXT: v_add_f32_e64 v0, s4, s5 3347; GFX11-NEXT: v_add_f32_e64 v1, s4, s2 3348; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3349; GFX11-NEXT: v_max_f32_e64 v0, v0, v1 clamp 3350; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] offset:12 3351; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3352; GFX11-NEXT: s_endpgm 3353{ 3354 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 0 3355 %gep1 = getelementptr float, float addrspace(1)* %aptr, i32 1 3356 %gep2 = getelementptr float, float addrspace(1)* %aptr, i32 2 3357 %l0 = load float, float addrspace(1)* %gep0 3358 %l1 = load float, float 
addrspace(1)* %gep1 3359 %l2 = load float, float addrspace(1)* %gep2 3360 %a = fadd nsz float %l0, %l1 3361 %b = fadd nsz float %l0, %l2 3362 %res = call nsz float @llvm.maxnum.f32(float %a, float %b) 3363 %max = call nsz float @llvm.maxnum.f32(float %res, float 0.0) 3364 %min = call nsz float @llvm.minnum.f32(float %max, float 1.0) 3365 %out.gep = getelementptr float, float addrspace(1)* %out, i32 3 3366 store float %min, float addrspace(1)* %out.gep 3367 ret void 3368} 3369 3370declare i32 @llvm.amdgcn.workitem.id.x() #1 3371declare float @llvm.fabs.f32(float) #1 3372declare float @llvm.minnum.f32(float, float) #1 3373declare float @llvm.maxnum.f32(float, float) #1 3374declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1 3375declare double @llvm.fabs.f64(double) #1 3376declare double @llvm.minnum.f64(double, double) #1 3377declare double @llvm.maxnum.f64(double, double) #1 3378declare half @llvm.fabs.f16(half) #1 3379declare half @llvm.minnum.f16(half, half) #1 3380declare half @llvm.maxnum.f16(half, half) #1 3381declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1 3382declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1 3383declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1 3384 3385attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 3386attributes #1 = { nounwind readnone } 3387attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } 3388attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } 3389attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } 3390