; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s

; Verifies that the "amdgpu-num-sgpr" function attribute (see the
; attributes #0/#2 definitions at the end of the file) bounds the SGPR
; usage the AMDGPU backend reports, under both SGPR-to-VGPR spilling
; (the TOSGPR prefix) and SGPR-to-scratch-memory spilling (the TOSMEM
; prefix).

; If spilling to smem, additional registers are used for the resource
; descriptor.

; ALL-LABEL: {{^}}max_12_sgprs:

; FIXME: Should be able to skip this copying of the private segment
; buffer because all the SGPR spills are to VGPRs.

; ALL: s_mov_b64 s[10:11], s[2:3]
; ALL: s_mov_b64 s[8:9], s[0:1]
; ALL: SGPRBlocks: 1
; ALL: NumSGPRsForWavesPerEU: 14
define void @max_12_sgprs(i32 addrspace(1)* %out1,

                          i32 addrspace(1)* %out2,
                          i32 addrspace(1)* %out3,
                          i32 addrspace(1)* %out4,
                          i32 %one, i32 %two, i32 %three, i32 %four) #0 {
  store i32 %one, i32 addrspace(1)* %out1
  store i32 %two, i32 addrspace(1)* %out2
  store i32 %three, i32 addrspace(1)* %out3
  store i32 %four, i32 addrspace(1)* %out4
  ret void
}

; Input-SGPR accounting for the following function (each enabled input
; consumes the listed number of SGPRs):
; private resource: 4
; scratch wave offset: 1
; workgroup ids: 3
; dispatch id: 2
; queue ptr: 2
; flat scratch init: 2
; ---------------------
; total: 14

; + reserved vcc = 16

; Because we can't handle re-using the last few input registers as the
; special vcc etc. registers (as well as decide to not use the unused
; features when the number of registers is frozen), this ends up using
; more than expected.
; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
; TOSGPR: SGPRBlocks: 1
; TOSGPR: NumSGPRsForWavesPerEU: 16

; TOSMEM: s_mov_b64 s[10:11], s[2:3]
; TOSMEM: s_mov_b64 s[8:9], s[0:1]
; TOSMEM: s_mov_b32 s7, s13

; TOSMEM: SGPRBlocks: 1
; TOSMEM: NumSGPRsForWavesPerEU: 16
define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
                                         i32 addrspace(1)* %out2,
                                         i32 addrspace(1)* %out3,
                                         i32 addrspace(1)* %out4,
                                         i32 %one, i32 %two, i32 %three, i32 %four) #2 {
  store volatile i32 0, i32* undef
  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
  store volatile i32 %x.0, i32 addrspace(1)* undef
  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
  ; NOTE(review): the next two stores reuse %x.0 rather than %x.1/%x.2.
  ; The intrinsic calls by themselves are what force the workgroup-id
  ; input SGPRs to be enabled, so the stored value should not affect the
  ; SGPR counts being checked -- but confirm this was intentional and
  ; not a copy-paste slip before "fixing" it.
  store volatile i32 %x.0, i32 addrspace(1)* undef
  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
  store volatile i32 %x.0, i32 addrspace(1)* undef
  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
  store volatile i64 %x.3, i64 addrspace(1)* undef
  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
  %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
  store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef

  store i32 %one, i32 addrspace(1)* %out1
  store i32 %two, i32 addrspace(1)* %out2
  store i32 %three, i32 addrspace(1)* %out3
  store i32 %four, i32 addrspace(1)* %out4
  ret void
}

; The following test is commented out for now; http://llvm.org/PR31230
; XALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}
; ; Make sure copies for input buffer are not clobbered. This requires
; ; swapping the order the registers are copied from what normally
; ; happens.
; XTOSMEM: s_mov_b32 s5, s11
; XTOSMEM: s_add_u32 m0, s5,
; XTOSMEM: s_buffer_store_dword vcc_lo, s[0:3], m0

; XALL: SGPRBlocks: 2
; XALL: NumSGPRsForWavesPerEU: 18
;define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
;                                         i32 addrspace(1)* %out2,
;                                         i32 addrspace(1)* %out3,
;                                         i32 addrspace(1)* %out4,
;                                         i32 %one, i32 %two, i32 %three, i32 %four) #2 {
;  store volatile i32 0, i32* undef
;  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
;  store volatile i64 %x.3, i64 addrspace(1)* undef
;  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
;  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
;
;  store i32 %one, i32 addrspace(1)* %out1
;  store i32 %two, i32 addrspace(1)* %out2
;  store i32 %three, i32 addrspace(1)* %out3
;  store i32 %four, i32 addrspace(1)* %out4
;  ret void
;}

; Declarations for the AMDGPU intrinsics used above; calling each one
; forces the corresponding input SGPRs to be enabled for the function.
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
declare i32 @llvm.amdgcn.workgroup.id.z() #1
declare i64 @llvm.amdgcn.dispatch.id() #1
declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #1

; Attribute sets: #0 and #2 clamp the SGPR budget via "amdgpu-num-sgpr";
; #1 marks the intrinsics as readnone.
; NOTE(review): #3 is referenced by no function in this file (the
; commented-out test uses #2) -- presumably kept for when PR31230 is
; resolved; confirm before removing.
attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }