; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s

; Checks which VGPRs hold the address operand of image_gather4_c_b_cl in
; functions that also call @extern_func. Each function is checked twice, once
; for gfx900 and once for gfx1010.

declare void @extern_func() #2

; %rsrc and %samp are not forwarded to the intrinsic (it is passed undef for
; both). The checks below copy v12-v16, presumably the five float arguments.
define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
; The vgpr tuple8 operand in image_gather4_c_b_cl instruction need not be
; preserved across the call and should get 8 scratch registers.
; The checks below expect the address in v[32:36]; only v40-v44 are stored and
; reloaded by this function.

; GFX9-LABEL: non_preserved_vgpr_tuple8:
; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill

; GFX9: v_mov_b32_e32 v36, v16
; GFX9-NEXT: v_mov_b32_e32 v35, v15
; GFX9-NEXT: v_mov_b32_e32 v34, v14
; GFX9-NEXT: v_mov_b32_e32 v33, v13
; GFX9-NEXT: v_mov_b32_e32 v32, v12

; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill

; GFX9: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
; GFX9: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1
; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]

; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9: s_setpc_b64 s[4:5]
;
; GFX10-LABEL: non_preserved_vgpr_tuple8:
; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill

; GFX10: v_mov_b32_e32 v36, v16
; GFX10-NEXT: v_mov_b32_e32 v35, v15
; GFX10-NEXT: v_mov_b32_e32 v34, v14
; GFX10-NEXT: v_mov_b32_e32 v33, v13
; GFX10-NEXT: v_mov_b32_e32 v32, v12

; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill

; GFX10: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND

; GFX10: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX10: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]

; GFX10: buffer_load_dword v44, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12

; GFX10: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX10: s_setpc_b64 s[4:5]
main_body:
  ; Four empty inline-asm statements that together clobber v0-v31, presumably
  ; so the gather operands cannot stay in the incoming argument registers.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
  call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0
  call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0
  call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0
  ; The gather inputs are last used here, before the call. The result %v is
  ; returned after the call, so only the result is live across @extern_func.
  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
  call void @extern_func()
  ret <4 x float> %v
}

; Same signature as above; here the five float arguments feed a gather on each
; side of the call to @extern_func.
define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
; The vgpr tuple8 operand in image_gather4_c_b_cl instruction needs to be preserved
; across the call and should get allocated to 8 CSRs (callee-saved registers).
; Only the lower 5 sub-registers of the tuple are preserved.
; The upper 3 sub-registers are unused.
; The checks below expect v41-v45 to be stored on entry, to hold the address
; for the gather after the call, and to be reloaded before returning.

; GFX9-LABEL: call_preserved_vgpr_tuple8:
; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill

; GFX9: v_mov_b32_e32 v45, v16
; GFX9-NEXT: v_mov_b32_e32 v44, v15
; GFX9-NEXT: v_mov_b32_e32 v43, v14
; GFX9-NEXT: v_mov_b32_e32 v42, v13
; GFX9-NEXT: v_mov_b32_e32 v41, v12

; GFX9: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1

; GFX9: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload

; GFX9: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9: s_setpc_b64 s[4:5]
;
; GFX10-LABEL: call_preserved_vgpr_tuple8:
; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill

; On gfx1010 the gather before the call is expected to read v[12:16] directly,
; while the gather after the call uses the register list v45..v41.

; GFX10: image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX10-NEXT: v_writelane_b32 v40, s30, 8
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_writelane_b32 v40, s31, 9
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D

; GFX10: buffer_load_dword v45, off, s[0:3], s33{{$}}
; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16
; GFX10: buffer_load_dword v40, off, s[0:3], s32 offset:20
; GFX10: s_setpc_b64 s[4:5]
main_body:
  ; First gather: its result is stored before the call.
  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
  store <4 x float> %v, <4 x float> addrspace(1)* undef
  call void @extern_func()
  ; Second gather reuses the same five float inputs after the call, so those
  ; inputs are live across @extern_func.
  %v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
  ret <4 x float> %v1
}

declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1

; #0 is used on the inline-asm clobber statements, #1 on the gather intrinsic
; declaration, and #2 on the @extern_func declaration.
attributes #0 = { nounwind writeonly }
attributes #1 = { nounwind readonly }
attributes #2 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }