; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s

; DAGCombiner will transform:
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
; unless isFabsFree returns true
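; For example, as a rough IR-level sketch of that combine (illustrative only;
; the value names are hypothetical and nothing here is checked by FileCheck):
;   %bc   = bitcast i16 %a to half
;   %fabs = call half @llvm.fabs.f16(half %bc)
; becomes, when fabs is not free for f16:
;   %and  = and i16 %a, 32767    ; 0x7fff clears the f16 sign bit
;   %res  = bitcast i16 %and to half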

define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
; CI-LABEL: s_fabs_free_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    flat_store_short v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: s_fabs_free_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_fabs_free_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %bc = bitcast i16 %in to half
  %fabs = call half @llvm.fabs.f16(half %bc)
  store half %fabs, half addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
; CI-LABEL: s_fabs_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    flat_store_short v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: s_fabs_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_fabs_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %fabs = call half @llvm.fabs.f16(half %in)
  store half %fabs, half addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
; CI-LABEL: s_fabs_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: s_fabs_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_fabs_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
  store <2 x half> %fabs, <2 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
; CI-LABEL: s_fabs_v4f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_and_b32 s1, s1, 0x7fff7fff
; CI-NEXT:    s_and_b32 s0, s0, 0x7fff7fff
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v3, s3
; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: s_fabs_v4f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s1, s1, 0x7fff7fff
; VI-NEXT:    s_and_b32 s0, s0, 0x7fff7fff
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_fabs_v4f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s1, s1, 0x7fff7fff
; GFX9-NEXT:    s_and_b32 s0, s0, 0x7fff7fff
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT:    s_endpgm
  %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
  store <4 x half> %fabs, <4 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
; CI-LABEL: fabs_fold_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s0, s[4:5], 0x2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
; CI-NEXT:    s_lshr_b32 s0, s0, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    v_mul_f32_e32 v0, v0, v1
; CI-NEXT:    v_cvt_f16_f32_e32 v2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    flat_store_short v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: fabs_fold_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_mul_f16_e64 v2, |s2|, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fabs_fold_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_mul_f16_e64 v1, |s2|, v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %fabs = call half @llvm.fabs.f16(half %in0)
  %fmul = fmul half %fabs, %in1
  store half %fmul, half addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; CI-LABEL: v_fabs_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dword v2, v[0:1]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: v_fabs_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v2, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_fabs_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  store <2 x half> %fabs, <2 x half> addrspace(1)* %gep.out
  ret void
}

define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
; CI-LABEL: fabs_free_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: fabs_free_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fabs_free_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
  %bc = bitcast i32 %in to <2 x half>
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc)
  store <2 x half> %fabs, <2 x half> addrspace(1)* %out
  ret void
}

; FIXME: Should do fabs after conversion to avoid converting multiple
; times in this particular case.
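; A rough sketch of that idea for the CI lowering (illustrative only; the
; register choices are hypothetical and this is not what the checks below
; expect):
;   v_cvt_f32_f16_e32 v1, v0      ; convert each half once
;   v_mul_f32_e64 v0, |v1|, v1    ; apply fabs as an f32 source modifier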
define amdgpu_kernel void @v_fabs_fold_self_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; CI-LABEL: v_fabs_fold_self_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dword v0, v[0:1]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v1
; CI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
; CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
; CI-NEXT:    v_mul_f32_e32 v1, v1, v2
; CI-NEXT:    v_cvt_f16_f32_e32 v2, v1
; CI-NEXT:    v_mul_f32_e32 v0, v0, v3
; CI-NEXT:    v_cvt_f16_f32_e32 v3, v0
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_or_b32_e32 v2, v3, v2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: v_fabs_fold_self_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v2, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f16_sdwa v3, |v2|, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_mul_f16_e64 v2, |v2|, v2
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_fabs_fold_self_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v0
; GFX9-NEXT:    v_pk_mul_f16 v0, v2, v0
; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %fmul = fmul <2 x half> %fabs, %val
  store <2 x half> %fmul, <2 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 %other.val) #0 {
; CI-LABEL: v_fabs_fold_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dword v0, v[0:1]
; CI-NEXT:    s_lshr_b32 s2, s4, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; CI-NEXT:    v_cvt_f32_f16_e32 v3, s4
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e64 v2, |v2|
; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
; CI-NEXT:    v_mul_f32_e32 v1, v2, v1
; CI-NEXT:    v_cvt_f16_f32_e32 v2, v1
; CI-NEXT:    v_mul_f32_e32 v0, v0, v3
; CI-NEXT:    v_cvt_f16_f32_e32 v3, v0
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_or_b32_e32 v2, v3, v2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: v_fabs_fold_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v2, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_lshr_b32 s0, s4, 16
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f16_sdwa v3, |v2|, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_mul_f16_e64 v2, |v2|, s4
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_fabs_fold_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s6
; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %other.val.cvt = bitcast i32 %other.val to <2 x half>
  %fmul = fmul <2 x half> %fabs, %other.val.cvt
  store <2 x half> %fmul, <2 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_extract_fabs_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
; CI-LABEL: v_extract_fabs_fold_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dword v0, v[0:1]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
; CI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
; CI-NEXT:    v_mul_f32_e32 v0, 4.0, v0
; CI-NEXT:    v_add_f32_e32 v1, 2.0, v1
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CI-NEXT:    flat_store_short v[0:1], v0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    flat_store_short v[0:1], v1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; VI-LABEL: v_extract_fabs_fold_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, 0x4000
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f16_e64 v2, |v0|, 4.0
; VI-NEXT:    v_add_f16_sdwa v0, |v0|, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_short v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_extract_fabs_fold_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_f16_e64 v2, |v0|, 4.0
; GFX9-NEXT:    v_add_f16_sdwa v0, |v0|, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    global_store_short v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_short v[0:1], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %elt0 = extractelement <2 x half> %fabs, i32 0
  %elt1 = extractelement <2 x half> %fabs, i32 1

  %fmul0 = fmul half %elt0, 4.0
  %fadd1 = fadd half %elt1, 2.0
  store volatile half %fmul0, half addrspace(1)* undef
  store volatile half %fadd1, half addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
; CI-LABEL: v_extract_fabs_no_fold_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dword v0, v[0:1]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_bfe_u32 v1, v0, 16, 15
; CI-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
; CI-NEXT:    flat_store_short v[0:1], v0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    flat_store_short v[0:1], v1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; VI-LABEL: v_extract_fabs_no_fold_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v0
; VI-NEXT:    v_bfe_u32 v0, v0, 16, 15
; VI-NEXT:    flat_store_short v[0:1], v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_short v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_extract_fabs_no_fold_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-NEXT:    global_store_short v[0:1], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_short_d16_hi v[0:1], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %elt0 = extractelement <2 x half> %fabs, i32 0
  %elt1 = extractelement <2 x half> %fabs, i32 1
  store volatile half %elt0, half addrspace(1)* undef
  store volatile half %elt1, half addrspace(1)* undef
  ret void
}

declare half @llvm.fabs.f16(half) #1
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
; GFX89: {{.*}}