; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GISEL %s

; Each function below builds a <2 x half> from two scalar halves for gfx1010.
; The first llc invocation uses the default instruction selector and its
; checks expect the two halves to be combined with v_pack_b32_f16. The second
; invocation passes -global-isel; its checks currently expect an SDWA write to
; the high word followed by v_and_or_b32 instead.

declare i32 @llvm.amdgcn.workitem.id.x() #1

; Two volatile half loads indexed by the workitem id, each with 2.0 added, are
; inserted into a <2 x half>, bitcast to i32 and consumed by inline asm in a
; VGPR ("v" constraint).
define amdgpu_kernel void @v_pack_b32_v2f16(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
; GCN-LABEL: v_pack_b32_v2f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    ; use v0
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_endpgm
;
; GISEL-LABEL: v_pack_b32_v2f16:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_waitcnt_depctr 0xffe3
; GISEL-NEXT:    s_movk_i32 s0, 0x4000
; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT:    v_add_f16_sdwa v1, v2, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GISEL-NEXT:    ;;#ASMSTART
; GISEL-NEXT:    ; use v0
; GISEL-NEXT:    ;;#ASMEND
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
  %v0 = load volatile half, half addrspace(1)* %in0.gep
  %v1 = load volatile half, half addrspace(1)* %in1.gep
  %v0.add = fadd half %v0, 2.0
  %v1.add = fadd half %v1, 2.0
  %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; Same shape as v_pack_b32_v2f16, except element 0 is produced by
; "fsub %v0, 2.0" (the value keeps the name %v0.add); element 1 is still an
; fadd.
define amdgpu_kernel void @v_pack_b32_v2f16_sub(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
; GCN-LABEL: v_pack_b32_v2f16_sub:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_subrev_f16_e32 v0, 2.0, v1
; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    ; use v0
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_endpgm
;
; GISEL-LABEL: v_pack_b32_v2f16_sub:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_waitcnt_depctr 0xffe3
; GISEL-NEXT:    s_movk_i32 s0, 0x4000
; GISEL-NEXT:    v_subrev_f16_e32 v0, 2.0, v1
; GISEL-NEXT:    v_add_f16_sdwa v1, v2, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GISEL-NEXT:    ;;#ASMSTART
; GISEL-NEXT:    ; use v0
; GISEL-NEXT:    ;;#ASMEND
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
  %v0 = load volatile half, half addrspace(1)* %in0.gep
  %v1 = load volatile half, half addrspace(1)* %in1.gep
  %v0.add = fsub half %v0, 2.0
  %v1.add = fadd half %v1, 2.0
  %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; Loads a <2 x float> from %a, truncates it to <2 x half> and stores the
; result to %r. The check lines sit between the opening parenthesis and the
; parameter list because the signature spans several lines.
define amdgpu_kernel void @fptrunc(
; GCN-LABEL: fptrunc:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_mov_b32 s7, 0x31016000
; GCN-NEXT:    s_mov_b32 s10, s6
; GCN-NEXT:    s_mov_b32 s11, s7
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s2
; GCN-NEXT:    s_mov_b32 s9, s3
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
;
; GISEL-LABEL: fptrunc:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
; GISEL-NEXT:    v_cvt_f16_f32_sdwa v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GISEL-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x float> addrspace(1)* %a) {
  %a.val = load <2 x float>, <2 x float> addrspace(1)* %a
  %r.val = fptrunc <2 x float> %a.val to <2 x half>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Both elements go through llvm.fabs.f16 before being packed. The
; default-selector checks expect the abs to appear as |v0|, |v1| source
; modifiers on v_pack_b32_f16; the -global-isel checks expect explicit
; 0x7fff masking instead.
define amdgpu_kernel void @v_pack_b32.fabs(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
; GCN-LABEL: v_pack_b32.fabs:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
; GCN-NEXT:    v_pack_b32_f16 v0, |v0|, |v1|
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    ; use v0
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_endpgm
;
; GISEL-LABEL: v_pack_b32.fabs:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, 0x7fff
; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v1
; GISEL-NEXT:    v_add_f16_e32 v2, 2.0, v2
; GISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
; GISEL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
; GISEL-NEXT:    ;;#ASMSTART
; GISEL-NEXT:    ; use v0
; GISEL-NEXT:    ;;#ASMEND
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
  %v0 = load volatile half, half addrspace(1)* %in0.gep
  %v1 = load volatile half, half addrspace(1)* %in1.gep
  %v0.add = fadd half %v0, 2.0
  %v1.add = fadd half %v1, 2.0
  %v0.fabs = call half @llvm.fabs.f16(half %v0.add)
  %v1.fabs = call half @llvm.fabs.f16(half %v1.add)
  %vec.0 = insertelement <2 x half> undef, half %v0.fabs, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %v1.fabs, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; Both elements are negated with "fsub half -0.0, x" before being packed. The
; default-selector checks expect the negation to appear as -v0, -v1 source
; modifiers on v_pack_b32_f16; the -global-isel checks expect explicit
; v_sub_f16 instructions with a 0x8000 operand.
define amdgpu_kernel void @v_pack_b32.fneg(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
; GCN-LABEL: v_pack_b32.fneg:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
; GCN-NEXT:    v_pack_b32_f16 v0, -v0, -v1
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    ; use v0
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_endpgm
;
; GISEL-LABEL: v_pack_b32.fneg:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_waitcnt_depctr 0xffe3
; GISEL-NEXT:    s_mov_b32 s0, 0x8000
; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
; GISEL-NEXT:    v_sub_f16_e32 v0, 0x8000, v0
; GISEL-NEXT:    v_sub_f16_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GISEL-NEXT:    ;;#ASMSTART
; GISEL-NEXT:    ; use v0
; GISEL-NEXT:    ;;#ASMEND
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
  %v0 = load volatile half, half addrspace(1)* %in0.gep
  %v1 = load volatile half, half addrspace(1)* %in1.gep
  %v0.add = fadd half %v0, 2.0
  %v1.add = fadd half %v1, 2.0
  %v0.fneg = fsub half -0.0, %v0.add
  %v1.fneg = fsub half -0.0, %v1.add
  %vec.0 = insertelement <2 x half> undef, half %v0.fneg, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %v1.fneg, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

declare half @llvm.fabs.f16(half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }