; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GISEL %s

; Each kernel below builds a <2 x half> value from two scalar halves (or from
; an fptrunc of <2 x float>) and is compiled twice: once with the default
; instruction selector (GCN prefix) and once with -global-isel (GISEL prefix).
; The GCN assertions expect a single v_pack_b32_f16; the GISEL assertions
; currently expect an SDWA/v_and_or_b32 sequence instead.
;
; NOTE(review): the IR uses typed-pointer syntax, `undef`, and `fsub -0.0` as
; negation. Modernizing any of these may change the selected code, so the
; assertions must be regenerated with the script named on the first line.

declare i32 @llvm.amdgcn.workitem.id.x() #1

; Pack two halves, each produced by `fadd x, 2.0`, into one 32-bit register.
define amdgpu_kernel void @v_pack_b32_v2f16(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
; GCN-LABEL: v_pack_b32_v2f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    ; use v0
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_endpgm
;
; GISEL-LABEL: v_pack_b32_v2f16:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_waitcnt_depctr 0xffe3
; GISEL-NEXT:    s_movk_i32 s0, 0x4000
; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT:    v_add_f16_sdwa v1, v2, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GISEL-NEXT:    ;;#ASMSTART
; GISEL-NEXT:    ; use v0
; GISEL-NEXT:    ;;#ASMEND
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
  %v0 = load volatile half, half addrspace(1)* %in0.gep
  %v1 = load volatile half, half addrspace(1)* %in1.gep
  %v0.add = fadd half %v0, 2.0
  %v1.add = fadd half %v1, 2.0
  %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; Same as above, but element 0 is produced by `fsub x, 2.0` instead of fadd.
define amdgpu_kernel void @v_pack_b32_v2f16_sub(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
; GCN-LABEL: v_pack_b32_v2f16_sub:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_subrev_f16_e32 v0, 2.0, v1
; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    ; use v0
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_endpgm
;
; GISEL-LABEL: v_pack_b32_v2f16_sub:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, 0x4000
; GISEL-NEXT:    v_add_f16_e32 v1, -2.0, v1
; GISEL-NEXT:    v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
; GISEL-NEXT:    ;;#ASMSTART
; GISEL-NEXT:    ; use v0
; GISEL-NEXT:    ;;#ASMEND
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
  %v0 = load volatile half, half addrspace(1)* %in0.gep
  %v1 = load volatile half, half addrspace(1)* %in1.gep
  %v0.add = fsub half %v0, 2.0
  %v1.add = fadd half %v1, 2.0
  %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; Load <2 x float>, truncate to <2 x half>, and store the packed result.
define amdgpu_kernel void @fptrunc(
; GCN-LABEL: fptrunc:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_mov_b32 s7, 0x31016000
; GCN-NEXT:    s_mov_b32 s10, s6
; GCN-NEXT:    s_mov_b32 s11, s7
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s2
; GCN-NEXT:    s_mov_b32 s9, s3
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
;
; GISEL-LABEL: fptrunc:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
; GISEL-NEXT:    v_cvt_f16_f32_sdwa v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GISEL-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x float> addrspace(1)* %a) {
  %a.val = load <2 x float>, <2 x float> addrspace(1)* %a
  %r.val = fptrunc <2 x float> %a.val to <2 x half>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; Pack with llvm.fabs applied to both halves; the GCN assertions expect the
; absolute value to appear as |v| source modifiers on v_pack_b32_f16.
define amdgpu_kernel void @v_pack_b32.fabs(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
; GCN-LABEL: v_pack_b32.fabs:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
; GCN-NEXT:    v_pack_b32_f16 v0, |v0|, |v1|
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    ; use v0
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_endpgm
;
; GISEL-LABEL: v_pack_b32.fabs:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, 0x7fff
; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v1
; GISEL-NEXT:    v_add_f16_e32 v2, 2.0, v2
; GISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
; GISEL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
; GISEL-NEXT:    ;;#ASMSTART
; GISEL-NEXT:    ; use v0
; GISEL-NEXT:    ;;#ASMEND
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
  %v0 = load volatile half, half addrspace(1)* %in0.gep
  %v1 = load volatile half, half addrspace(1)* %in1.gep
  %v0.add = fadd half %v0, 2.0
  %v1.add = fadd half %v1, 2.0
  %v0.fabs = call half @llvm.fabs.f16(half %v0.add)
  %v1.fabs = call half @llvm.fabs.f16(half %v1.add)
  %vec.0 = insertelement <2 x half> undef, half %v0.fabs, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %v1.fabs, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

; Pack with both halves negated via `fsub -0.0, x`; the GCN assertions expect
; the negation to appear as -v source modifiers on v_pack_b32_f16.
define amdgpu_kernel void @v_pack_b32.fneg(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
; GCN-LABEL: v_pack_b32.fneg:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
; GCN-NEXT:    v_pack_b32_f16 v0, -v0, -v1
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    ; use v0
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_endpgm
;
; GISEL-LABEL: v_pack_b32.fneg:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_waitcnt_depctr 0xffe3
; GISEL-NEXT:    s_mov_b32 s0, 0x8000
; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
; GISEL-NEXT:    v_add_f16_e64 v0, 0x8000, -v0
; GISEL-NEXT:    v_add_f16_sdwa v1, s0, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GISEL-NEXT:    ;;#ASMSTART
; GISEL-NEXT:    ; use v0
; GISEL-NEXT:    ;;#ASMEND
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
  %v0 = load volatile half, half addrspace(1)* %in0.gep
  %v1 = load volatile half, half addrspace(1)* %in1.gep
  %v0.add = fadd half %v0, 2.0
  %v1.add = fadd half %v1, 2.0
  %v0.fneg = fsub half -0.0, %v0.add
  %v1.fneg = fsub half -0.0, %v1.add
  %vec.0 = insertelement <2 x half> undef, half %v0.fneg, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %v1.fneg, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
  ret void
}

declare half @llvm.fabs.f16(half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }