; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s

; Checks which VGPRs hold the address operands and the result of
; image_gather4_c_b_cl in functions that also call @extern_func, for gfx900,
; gfx1010 and gfx1100.

declare void @extern_func() #2

define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
; The vgpr tuple8 operand in the image_gather4_c_b_cl instruction need not be
; preserved across the call and should get 8 scratch registers.
; GFX9-LABEL: non_preserved_vgpr_tuple8:
; GFX9:       ; %bb.0: ; %main_body
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-NEXT:    s_mov_b32 s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    v_mov_b32_e32 v36, v16
; GFX9-NEXT:    v_mov_b32_e32 v35, v15
; GFX9-NEXT:    v_mov_b32_e32 v34, v14
; GFX9-NEXT:    v_mov_b32_e32 v33, v13
; GFX9-NEXT:    v_mov_b32_e32 v32, v12
; GFX9-NEXT:    s_mov_b32 s5, s4
; GFX9-NEXT:    s_mov_b32 s6, s4
; GFX9-NEXT:    s_mov_b32 s7, s4
; GFX9-NEXT:    s_mov_b32 s8, s4
; GFX9-NEXT:    s_mov_b32 s9, s4
; GFX9-NEXT:    s_mov_b32 s10, s4
; GFX9-NEXT:    s_mov_b32 s11, s4
; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1
; GFX9-NEXT:    s_addk_i32 s32, 0x800
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    v_mov_b32_e32 v0, v41
; GFX9-NEXT:    v_mov_b32_e32 v1, v42
; GFX9-NEXT:    v_mov_b32_e32 v2, v43
; GFX9-NEXT:    v_mov_b32_e32 v3, v44
; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    s_addk_i32 s32, 0xf800
; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: non_preserved_vgpr_tuple8:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s4
; GFX10-NEXT:    v_mov_b32_e32 v36, v16
; GFX10-NEXT:    v_mov_b32_e32 v35, v15
; GFX10-NEXT:    v_mov_b32_e32 v34, v14
; GFX10-NEXT:    v_mov_b32_e32 v33, v13
; GFX10-NEXT:    v_mov_b32_e32 v32, v12
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_mov_b32 s5, s4
; GFX10-NEXT:    s_mov_b32 s6, s4
; GFX10-NEXT:    s_mov_b32 s7, s4
; GFX10-NEXT:    s_mov_b32 s8, s4
; GFX10-NEXT:    s_mov_b32 s9, s4
; GFX10-NEXT:    s_mov_b32 s10, s4
; GFX10-NEXT:    s_mov_b32 s11, s4
; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    ;;#ASMSTART
; GFX10-NEXT:    ;;#ASMEND
; GFX10-NEXT:    ;;#ASMSTART
; GFX10-NEXT:    ;;#ASMEND
; GFX10-NEXT:    ;;#ASMSTART
; GFX10-NEXT:    ;;#ASMEND
; GFX10-NEXT:    ;;#ASMSTART
; GFX10-NEXT:    ;;#ASMEND
; GFX10-NEXT:    image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT:    s_addk_i32 s32, 0x400
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_getpc_b64 s[4:5]
; GFX10-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_mov_b32_e32 v0, v41
; GFX10-NEXT:    v_mov_b32_e32 v1, v42
; GFX10-NEXT:    v_mov_b32_e32 v2, v43
; GFX10-NEXT:    v_mov_b32_e32 v3, v44
; GFX10-NEXT:    s_clause 0x3
; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33
; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4
; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8
; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: non_preserved_vgpr_tuple8:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:16 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s0
; GFX11-NEXT:    v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15
; GFX11-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
; GFX11-NEXT:    v_mov_b32_e32 v32, v12
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    v_writelane_b32 v40, s33, 2
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    s_mov_b32 s5, s0
; GFX11-NEXT:    s_mov_b32 s6, s0
; GFX11-NEXT:    s_mov_b32 s7, s0
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:12
; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8
; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:4
; GFX11-NEXT:    scratch_store_b32 off, v44, s33
; GFX11-NEXT:    ;;#ASMSTART
; GFX11-NEXT:    ;;#ASMEND
; GFX11-NEXT:    ;;#ASMSTART
; GFX11-NEXT:    ;;#ASMEND
; GFX11-NEXT:    ;;#ASMSTART
; GFX11-NEXT:    ;;#ASMEND
; GFX11-NEXT:    ;;#ASMSTART
; GFX11-NEXT:    ;;#ASMEND
; GFX11-NEXT:    image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT:    s_add_i32 s32, s32, 32
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_dual_mov_b32 v0, v41 :: v_dual_mov_b32 v1, v42
; GFX11-NEXT:    v_dual_mov_b32 v2, v43 :: v_dual_mov_b32 v3, v44
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_load_b32 v44, off, s33
; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:4
; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8
; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:12
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
; GFX11-NEXT:    v_readlane_b32 s33, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:16 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]










main_body:
  ; Four empty inline asm statements that together list v0-v31 as clobbered.
  ; The checks above expect the gather's address operands (incoming in v12-v16)
  ; to be copied to v[32:36] ahead of the asm statements.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
  call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0
  call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0
  call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0
  ; The gather is issued once, before the call; only its result is used after
  ; the call (as the return value). The checks expect the result in v[41:44],
  ; which are spilled in the prologue and reloaded in the epilogue.
  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
  call void @extern_func()
  ret <4 x float> %v
}

define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
; The vgpr tuple8 operand in the image_gather4_c_b_cl instruction needs to be
; preserved across the call and should get allocated to 8 CSRs.
; Only the lower 5 sub-registers of the tuple are preserved.
; The upper 3 sub-registers are unused.
; GFX9-LABEL: call_preserved_vgpr_tuple8:
; GFX9:       ; %bb.0: ; %main_body
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-NEXT:    v_writelane_b32 v40, s33, 10
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    v_writelane_b32 v40, s36, 2
; GFX9-NEXT:    v_writelane_b32 v40, s37, 3
; GFX9-NEXT:    v_writelane_b32 v40, s38, 4
; GFX9-NEXT:    v_writelane_b32 v40, s39, 5
; GFX9-NEXT:    v_writelane_b32 v40, s40, 6
; GFX9-NEXT:    v_writelane_b32 v40, s41, 7
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    v_writelane_b32 v40, s42, 8
; GFX9-NEXT:    s_mov_b32 s36, 0
; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    v_writelane_b32 v40, s43, 9
; GFX9-NEXT:    v_mov_b32_e32 v45, v16
; GFX9-NEXT:    v_mov_b32_e32 v44, v15
; GFX9-NEXT:    v_mov_b32_e32 v43, v14
; GFX9-NEXT:    v_mov_b32_e32 v42, v13
; GFX9-NEXT:    v_mov_b32_e32 v41, v12
; GFX9-NEXT:    s_mov_b32 s37, s36
; GFX9-NEXT:    s_mov_b32 s38, s36
; GFX9-NEXT:    s_mov_b32 s39, s36
; GFX9-NEXT:    s_mov_b32 s40, s36
; GFX9-NEXT:    s_mov_b32 s41, s36
; GFX9-NEXT:    s_mov_b32 s42, s36
; GFX9-NEXT:    s_mov_b32 s43, s36
; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
; GFX9-NEXT:    s_addk_i32 s32, 0x800
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT:    v_readlane_b32 s43, v40, 9
; GFX9-NEXT:    v_readlane_b32 s42, v40, 8
; GFX9-NEXT:    v_readlane_b32 s41, v40, 7
; GFX9-NEXT:    v_readlane_b32 s40, v40, 6
; GFX9-NEXT:    v_readlane_b32 s39, v40, 5
; GFX9-NEXT:    v_readlane_b32 s38, v40, 4
; GFX9-NEXT:    v_readlane_b32 s37, v40, 3
; GFX9-NEXT:    v_readlane_b32 s36, v40, 2
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    s_addk_i32 s32, 0xf800
; GFX9-NEXT:    v_readlane_b32 s33, v40, 10
; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: call_preserved_vgpr_tuple8:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s4
; GFX10-NEXT:    v_writelane_b32 v40, s33, 10
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_addk_i32 s32, 0x400
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_mov_b32_e32 v41, v16
; GFX10-NEXT:    v_mov_b32_e32 v42, v15
; GFX10-NEXT:    v_mov_b32_e32 v43, v14
; GFX10-NEXT:    v_mov_b32_e32 v44, v13
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    v_mov_b32_e32 v45, v12
; GFX10-NEXT:    v_writelane_b32 v40, s36, 2
; GFX10-NEXT:    s_mov_b32 s36, 0
; GFX10-NEXT:    v_writelane_b32 v40, s37, 3
; GFX10-NEXT:    s_mov_b32 s37, s36
; GFX10-NEXT:    v_writelane_b32 v40, s38, 4
; GFX10-NEXT:    s_mov_b32 s38, s36
; GFX10-NEXT:    v_writelane_b32 v40, s39, 5
; GFX10-NEXT:    s_mov_b32 s39, s36
; GFX10-NEXT:    v_writelane_b32 v40, s40, 6
; GFX10-NEXT:    s_mov_b32 s40, s36
; GFX10-NEXT:    v_writelane_b32 v40, s41, 7
; GFX10-NEXT:    s_mov_b32 s41, s36
; GFX10-NEXT:    v_writelane_b32 v40, s42, 8
; GFX10-NEXT:    s_mov_b32 s42, s36
; GFX10-NEXT:    v_writelane_b32 v40, s43, 9
; GFX10-NEXT:    s_mov_b32 s43, s36
; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_getpc_b64 s[4:5]
; GFX10-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT:    s_clause 0x4
; GFX10-NEXT:    buffer_load_dword v45, off, s[0:3], s33
; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:4
; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:8
; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:12
; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16
; GFX10-NEXT:    v_readlane_b32 s43, v40, 9
; GFX10-NEXT:    v_readlane_b32 s42, v40, 8
; GFX10-NEXT:    v_readlane_b32 s41, v40, 7
; GFX10-NEXT:    v_readlane_b32 s40, v40, 6
; GFX10-NEXT:    v_readlane_b32 s39, v40, 5
; GFX10-NEXT:    v_readlane_b32 s38, v40, 4
; GFX10-NEXT:    v_readlane_b32 s37, v40, 3
; GFX10-NEXT:    v_readlane_b32 s36, v40, 2
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
; GFX10-NEXT:    v_readlane_b32 s33, v40, 10
; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: call_preserved_vgpr_tuple8:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:20 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s0
; GFX11-NEXT:    v_writelane_b32 v40, s33, 10
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_clause 0x4
; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:16
; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:12
; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:8
; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:4
; GFX11-NEXT:    scratch_store_b32 off, v45, s33
; GFX11-NEXT:    s_add_i32 s32, s32, 32
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15
; GFX11-NEXT:    v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    v_mov_b32_e32 v45, v12
; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
; GFX11-NEXT:    s_mov_b32 s36, 0
; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
; GFX11-NEXT:    s_mov_b32 s37, s36
; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
; GFX11-NEXT:    s_mov_b32 s38, s36
; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
; GFX11-NEXT:    s_mov_b32 s39, s36
; GFX11-NEXT:    v_writelane_b32 v40, s40, 6
; GFX11-NEXT:    s_mov_b32 s40, s36
; GFX11-NEXT:    v_writelane_b32 v40, s41, 7
; GFX11-NEXT:    s_mov_b32 s41, s36
; GFX11-NEXT:    v_writelane_b32 v40, s42, 8
; GFX11-NEXT:    s_mov_b32 s42, s36
; GFX11-NEXT:    v_writelane_b32 v40, s43, 9
; GFX11-NEXT:    s_mov_b32 s43, s36
; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT:    s_getpc_b64 s[0:1]
; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT:    s_clause 0x4
; GFX11-NEXT:    scratch_load_b32 v45, off, s33
; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:4
; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:8
; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:12
; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:16
; GFX11-NEXT:    v_readlane_b32 s43, v40, 9
; GFX11-NEXT:    v_readlane_b32 s42, v40, 8
; GFX11-NEXT:    v_readlane_b32 s41, v40, 7
; GFX11-NEXT:    v_readlane_b32 s40, v40, 6
; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
; GFX11-NEXT:    v_readlane_b32 s33, v40, 10
; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:20 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]







main_body:
  ; The same gather is issued before and after the call with identical address
  ; operands, so those operands are live across the call. The checks above
  ; expect them in v41-v45, which are spilled in the prologue and reloaded in
  ; the epilogue.
  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
  ; The first result is stored before the call, so it is used and does not need
  ; to stay in a register across the call.
  store <4 x float> %v, <4 x float> addrspace(1)* undef
  call void @extern_func()
  %v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
  ret <4 x float> %v1
}

declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1

; Attribute groups: #0 is applied to the inline asm calls, #1 to the image
; gather intrinsic declaration, and #2 to the @extern_func declaration.
attributes #0 = { nounwind writeonly }
attributes #1 = { nounwind readonly }
attributes #2 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }