; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=MUBUF %s
; RUN: llc -global-isel -mattr=+enable-flat-scratch -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=FLATSCR %s

; End-to-end codegen test for call arguments that are passed on the stack.
; Both llc invocations above use GlobalISel on gfx900; the second one also
; enables flat scratch (-mattr=+enable-flat-scratch) and is verified with the
; FLATSCR prefix instead of the MUBUF prefix.
; This test will probably become redundant once the DAG and GlobalISel tests
; are unified.

declare hidden void @external_void_func_v16i32_v16i32_v4i32(<16 x i32>, <16 x i32>, <4 x i32>) #0
declare hidden void @external_void_func_byval([16 x i32] addrspace(5)* byval([16 x i32])) #0

; Kernel entry point calling an external function with two undef <16 x i32>
; arguments and the constant vector <9, 10, 11, 12>. The expected output stores
; the four constants as dwords relative to s32 at offsets 4, 8, 12 and 16.
define amdgpu_kernel void @kernel_caller_stack() {
; MUBUF-LABEL: kernel_caller_stack:
; MUBUF:       ; %bb.0:
; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; MUBUF-NEXT:    s_add_u32 s0, s0, s7
; MUBUF-NEXT:    s_mov_b32 s32, 0
; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
; MUBUF-NEXT:    v_mov_b32_e32 v0, 9
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; MUBUF-NEXT:    v_mov_b32_e32 v0, 10
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; MUBUF-NEXT:    v_mov_b32_e32 v0, 11
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; MUBUF-NEXT:    v_mov_b32_e32 v0, 12
; MUBUF-NEXT:    s_getpc_b64 s[4:5]
; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT:    s_endpgm
;
; FLATSCR-LABEL: kernel_caller_stack:
; FLATSCR:       ; %bb.0:
; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; FLATSCR-NEXT:    s_mov_b32 s32, 0
; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 9
; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:4
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 10
; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:8
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 11
; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:12
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 12
; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:16
; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT:    s_endpgm
  call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
  ret void
}

; Kernel entry point that zero-fills an addrspace(5) [16 x i32] alloca with
; memset and then passes that alloca byval to an external function. The
; expected output copies 64 bytes to s32 offsets 0 through 60 before the call.
; NOTE(review): the memset length is 128 bytes while [16 x i32] is only 64
; bytes, and the expected output contains the matching 128 bytes of zero
; stores. This looks unintentional - confirm, and regenerate the assertions
; with update_llc_test_checks.py if the length is changed.
define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-LABEL: kernel_caller_byval:
; MUBUF:       ; %bb.0:
; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; MUBUF-NEXT:    s_add_u32 s0, s0, s7
; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:16
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:20
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:24
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:28
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:32
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:36
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:40
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:44
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:48
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:52
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:56
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:60
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:64
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:68
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:72
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:76
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:80
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:84
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:88
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:92
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:96
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:100
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:104
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:108
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:112
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:116
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:120
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:124
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:128
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:132
; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT:    s_nop 0
; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:12
; MUBUF-NEXT:    buffer_load_dword v2, off, s[0:3], 0 offset:16
; MUBUF-NEXT:    buffer_load_dword v3, off, s[0:3], 0 offset:20
; MUBUF-NEXT:    buffer_load_dword v4, off, s[0:3], 0 offset:24
; MUBUF-NEXT:    buffer_load_dword v5, off, s[0:3], 0 offset:28
; MUBUF-NEXT:    buffer_load_dword v6, off, s[0:3], 0 offset:32
; MUBUF-NEXT:    buffer_load_dword v7, off, s[0:3], 0 offset:36
; MUBUF-NEXT:    buffer_load_dword v8, off, s[0:3], 0 offset:40
; MUBUF-NEXT:    buffer_load_dword v9, off, s[0:3], 0 offset:44
; MUBUF-NEXT:    buffer_load_dword v10, off, s[0:3], 0 offset:48
; MUBUF-NEXT:    buffer_load_dword v11, off, s[0:3], 0 offset:52
; MUBUF-NEXT:    buffer_load_dword v12, off, s[0:3], 0 offset:56
; MUBUF-NEXT:    buffer_load_dword v13, off, s[0:3], 0 offset:60
; MUBUF-NEXT:    buffer_load_dword v14, off, s[0:3], 0 offset:64
; MUBUF-NEXT:    buffer_load_dword v15, off, s[0:3], 0 offset:68
; MUBUF-NEXT:    s_movk_i32 s32, 0x1400
; MUBUF-NEXT:    s_getpc_b64 s[4:5]
; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:24
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:28
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:32
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:36
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:40
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:44
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:48
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:52
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:56
; MUBUF-NEXT:    s_waitcnt vmcnt(15)
; MUBUF-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:60
; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT:    s_endpgm
;
; FLATSCR-LABEL: kernel_caller_byval:
; FLATSCR:       ; %bb.0:
; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:8
; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:72
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16
; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:80
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:24
; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:88
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:32
; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:96
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:40
; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:104
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:48
; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:112
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:56
; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:120
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:64
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:128
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s33 offset:8
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_load_dwordx2 v[2:3], off, s33 offset:16
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_load_dwordx2 v[4:5], off, s33 offset:24
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_load_dwordx2 v[6:7], off, s33 offset:32
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_load_dwordx2 v[8:9], off, s33 offset:40
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_load_dwordx2 v[10:11], off, s33 offset:48
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_load_dwordx2 v[12:13], off, s33 offset:56
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    scratch_load_dwordx2 v[14:15], off, s33 offset:64
; FLATSCR-NEXT:    s_movk_i32 s32, 0x50
; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32
; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:8
; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16
; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:24
; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[8:9], s32 offset:32
; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[10:11], s32 offset:40
; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[12:13], s32 offset:48
; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[14:15], s32 offset:56
; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT:    s_endpgm
  %buf = alloca [16 x i32], align 4, addrspace(5)
  %buf.i8 = bitcast [16 x i32] addrspace(5)* %buf to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %buf.i8, i8 0, i32 128, i1 false)
  call void @external_void_func_byval([16 x i32] addrspace(5)* byval([16 x i32]) %buf)
  ret void
}

; Non-kernel counterpart of kernel_caller_stack: the same call made from a
; regular function. Besides the four argument stores, the expected output
; saves s30, s31 and s33 into lanes of v40 around the call and restores them
; before returning with s_setpc_b64.
define void @func_caller_stack() {
; MUBUF-LABEL: func_caller_stack:
; MUBUF:       ; %bb.0:
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
; MUBUF-NEXT:    v_writelane_b32 v40, s33, 2
; MUBUF-NEXT:    s_mov_b32 s33, s32
; MUBUF-NEXT:    s_addk_i32 s32, 0x400
; MUBUF-NEXT:    v_mov_b32_e32 v0, 9
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; MUBUF-NEXT:    v_mov_b32_e32 v0, 10
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; MUBUF-NEXT:    v_mov_b32_e32 v0, 11
; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; MUBUF-NEXT:    v_mov_b32_e32 v0, 12
; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; MUBUF-NEXT:    s_getpc_b64 s[4:5]
; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
; MUBUF-NEXT:    s_addk_i32 s32, 0xfc00
; MUBUF-NEXT:    v_readlane_b32 s33, v40, 2
; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_caller_stack:
; FLATSCR:       ; %bb.0:
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
; FLATSCR-NEXT:    v_writelane_b32 v40, s33, 2
; FLATSCR-NEXT:    s_mov_b32 s33, s32
; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 9
; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:4
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 10
; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:8
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 11
; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:12
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 12
; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:16
; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT:    s_add_i32 s32, s32, -16
; FLATSCR-NEXT:    v_readlane_b32 s33, v40, 2
; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
  call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
  ret void
}

; Regular function that forwards its addrspace(5) pointer parameter as a byval
; [16 x i32] argument. The expected output loads the 64 bytes through the
; incoming pointer in v0 and stores them to s32 offsets 0 through 60 before the
; call. The %argptr.i8 bitcast is not used by any other instruction.
define void @func_caller_byval([16 x i32] addrspace(5)* %argptr) {
; MUBUF-LABEL: func_caller_byval:
; MUBUF:       ; %bb.0:
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_writelane_b32 v40, s33, 2
; MUBUF-NEXT:    s_mov_b32 s33, s32
; MUBUF-NEXT:    s_addk_i32 s32, 0x400
; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
; MUBUF-NEXT:    s_getpc_b64 s[4:5]
; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:8
; MUBUF-NEXT:    s_nop 0
; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:12
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12
; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:16
; MUBUF-NEXT:    s_nop 0
; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:20
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:16
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:24
; MUBUF-NEXT:    s_nop 0
; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:28
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:24
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:28
; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:32
; MUBUF-NEXT:    s_nop 0
; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:36
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:32
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:36
; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:40
; MUBUF-NEXT:    s_nop 0
; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:44
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:40
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:44
; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:48
; MUBUF-NEXT:    s_nop 0
; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:52
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:48
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:52
; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:56
; MUBUF-NEXT:    s_nop 0
; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:60
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:56
; MUBUF-NEXT:    s_waitcnt vmcnt(1)
; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:60
; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
; MUBUF-NEXT:    s_addk_i32 s32, 0xfc00
; MUBUF-NEXT:    v_readlane_b32 s33, v40, 2
; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_caller_byval:
; FLATSCR:       ; %bb.0:
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off
; FLATSCR-NEXT:    v_writelane_b32 v40, s33, 2
; FLATSCR-NEXT:    s_mov_b32 s33, s32
; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32
; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:8
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:8
; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:16
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:16
; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:24
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:24
; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:32
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:32
; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:40
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:40
; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:48
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:48
; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], v0, off offset:56
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:56
; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT:    s_add_i32 s32, s32, -16
; FLATSCR-NEXT:    v_readlane_b32 s33, v40, 2
; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
; FLATSCR-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
  %argptr.i8 = bitcast [16 x i32] addrspace(5)* %argptr to i8 addrspace(5)*
  call void @external_void_func_byval([16 x i32] addrspace(5)* byval([16 x i32]) %argptr)
  ret void
}

declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture writeonly, i8, i32, i1 immarg) #1

attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #1 = { argmemonly nofree nounwind willreturn writeonly }