; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s

; Each kernel in this file caps its VGPR budget with the "amdgpu-num-vgpr"
; function attribute and then checks how AGPR values are saved and restored,
; and how much scratch memory the kernel ends up reporting.
3
; Two chained 16x16x1 f32 MFMAs under a 24-VGPR limit (attribute group #0).
; The checks require that no scratch resource descriptor is set up, that no
; buffer_store_dword / buffer_load_dword is emitted, and that the reported
; scratch size is zero. v_accvgpr_write_b32 is expected on gfx90a only.
; GCN-LABEL: {{^}}max_24regs_32a_used:
; GCN-NOT:     s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT:     s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN-DAG:     v_mfma_f32_16x16x1f32
; GCN-DAG:     v_mfma_f32_16x16x1f32
; GCN-DAG:     v_accvgpr_read_b32
; GCN-NOT:     buffer_store_dword
; GCN-NOT:     buffer_load_dword
; GFX908-NOT:  v_accvgpr_write_b32
; GFX90A:      v_accvgpr_write_b32
; GCN:         ScratchSize: 0
define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 {
bb:
  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  ; The second MFMA accumulates into the first one's result; lanes of both
  ; results are read afterwards, so both 16-element values are live together.
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
  %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %mai.1, i32 0, i32 0, i32 0)
  %elt1 = extractelement <16 x float> %mai.2, i32 0
  %elt2 = extractelement <16 x float> %mai.1, i32 15
  %elt3 = extractelement <16 x float> %mai.1, i32 14
  %elt4 = extractelement <16 x float> %mai.2, i32 1
  ; Write the four extracted lanes to out[0..3].
  store float %elt1, float addrspace(1)* %out
  %gep1 = getelementptr float, float addrspace(1)* %out, i64 1
  store float %elt2, float addrspace(1)* %gep1
  %gep2 = getelementptr float, float addrspace(1)* %out, i64 2
  store float %elt3, float addrspace(1)* %gep2
  %gep3 = getelementptr float, float addrspace(1)* %out, i64 3
  store float %elt4, float addrspace(1)* %gep3
  ret void
}
34
; Two chained 4x4x1 f32 MFMAs under a 12-VGPR limit (attribute group #2), with
; a conditional block that requests five more 'a'-constrained operands while
; both MFMA results are still needed. The checks expect an AGPR to be copied
; into a VGPR and later copied back from that same VGPR, with no buffer
; loads/stores and a reported scratch size of zero.
; GCN-LABEL: {{^}}max_12regs_13a_used:
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN:     v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
; GCN-NOT: buffer_store_dword
; GCN-NOT: buffer_load_dword
; GCN:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; GCN:     ScratchSize: 0
define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
  %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
  %cmp = icmp eq i32 %cond, 0
  br i1 %cmp, label %use, label %st

use:
  ; Five extra 'a' operands are requested here while %mai.1 and %mai.2 are
  ; still live (both are consumed in %st).
  call void asm sideeffect "", "a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5)
  store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> addrspace(1)* %out
  br label %st

st:
  ; Consume both MFMA results as 'a' operands so they stay live up to here.
  call void asm sideeffect "", "a,a"(<4 x float> %mai.1, <4 x float> %mai.2)
  ret void
}
62
; Inline asm defines four 'a'-constrained values (4 + 4 + 1 + 2 dwords) under a
; 10-VGPR limit (attribute group #1); the first three are consumed together and
; the last one afterwards. The checks expect an AGPR to be copied into a VGPR
; and later restored from that same VGPR, with no buffer loads/stores and a
; reported scratch size of zero.
; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN:     v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
; GCN-NOT: buffer_store_dword
; GCN-NOT: buffer_load_dword
; GCN:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; GCN:     ScratchSize: 0
define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
  ; Definitions: all four values are produced before any of them is used.
  %a1 = call <4 x i32> asm sideeffect "", "=a"()
  %a2 = call <4 x i32> asm sideeffect "", "=a"()
  %a3 = call i32 asm sideeffect "", "=a"()
  %a4 = call <2 x i32> asm sideeffect "", "=a"()
  ; Uses: %a1..%a3 in one asm statement, then %a4 on its own.
  call void asm sideeffect "", "a,a,a"(<4 x i32> %a1, <4 x i32> %a2, i32 %a3)
  call void asm sideeffect "", "a"(<2 x i32> %a4)
  ret void
}
80
; A single 'a'-constrained value is defined before, and used after, a 32x32x1
; f32 MFMA whose accumulator is a 32-element vector, under a 32-VGPR limit
; (attribute group #3). The checks expect a v_accvgpr_read_b32 before the MFMA
; and a v_accvgpr_write_b32 after it, with no buffer loads/stores and a
; reported scratch size of zero.
; GCN-LABEL: {{^}}max_32regs_mfma32:
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN-NOT: buffer_store_dword
; GCN:     v_accvgpr_read_b32
; GCN:     v_mfma_f32_32x32x1f32
; GCN-NOT: buffer_load_dword
; GCN:     v_accvgpr_write_b32
; GCN:     ScratchSize: 0
define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 {
bb:
  ; %v is defined here and only consumed after the MFMA in %use, so it is live
  ; across the MFMA.
  %v = call i32 asm sideeffect "", "=a"()
  br label %use

use:
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 2.0>, i32 0, i32 0, i32 0)
  call void asm sideeffect "", "a"(i32 %v)
  ; Only lane 0 of the MFMA result is stored.
  %elt1 = extractelement <32 x float> %mai.1, i32 0
  store float %elt1, float addrspace(1)* %arg
  ret void
}
102
; Should spill agprs to memory for both gfx908 and gfx90a.
; The kernel is limited to 5 VGPRs (attribute group #4). A 4-element
; 'a'-constrained value and one 'v'-constrained value are both defined before
; a 4x4x1 f32 MFMA and used after it. Unlike the other kernels in this file,
; the checks here expect the scratch resource descriptor to be set up and
; scratch loads/stores to appear around the MFMA.
; GCN-LABEL: {{^}}max_5regs_used_8a:
; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1

; GFX908-DAG:  v_accvgpr_read_b32 v1, a0 ; Reload Reuse
; GFX908-DAG:  buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
; GFX908-DAG:  v_accvgpr_read_b32 v1, a1 ; Reload Reuse
; GFX908-DAG:  buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
; GFX908-DAG:  v_accvgpr_read_b32 v1, a2 ; Reload Reuse
; GFX908-DAG:  buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
; GFX908-DAG:  v_accvgpr_read_b32 v1, a3 ; Reload Reuse
; GFX908-DAG:  buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill

; GFX90A-DAG:  buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
; GFX90A-DAG:  buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
; GFX90A-DAG:  v_accvgpr_read_b32 v4, a2               ;  Reload Reuse
; GFX90A-DAG:  v_accvgpr_read_b32 v3, a3               ;  Reload Reuse

; GCN:  v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]

; GFX908-DAG:  buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
; GFX908-DAG:  buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
; GFX908-DAG:  buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
; GFX908-DAG:  buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload
; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off

; The scratch descriptor register range is matched with a pattern here, the
; same way as in the spill checks above, rather than pinned to fixed SGPRs.
; GFX90A-DAG:  buffer_load_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
; GFX90A-DAG:  buffer_load_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
; GFX90A-DAG:  v_accvgpr_write_b32 a2, v4              ;  Reload Reuse
; GFX90A-DAG:  v_accvgpr_write_b32 a3, v3              ;  Reload Reuse
; GFX90A:  global_store_dwordx4 v[0:1], a[0:3], off

; GCN: ScratchSize: 20
define amdgpu_kernel void @max_5regs_used_8a(<4 x float> addrspace(1)* %arg) #4 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %v0 and %a4 are defined before the MFMA and consumed after it, so both are
  ; live across it.
  %v0 = call float asm sideeffect "; def $0", "=v"()
  %a4 = call <4 x float> asm sideeffect "; def $0", "=a"()
  ; The MFMA accumulator is loaded from, and its result stored back to,
  ; arg[tid].
  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
  %mai.in = load <4 x float>, <4 x float> addrspace(1)* %gep
  %mai.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.in, i32 0, i32 0, i32 0)
  store <4 x float> %mai.out, <4 x float> addrspace(1)* %gep
  store volatile <4 x float> %a4, <4 x float> addrspace(1)* undef
  call void asm sideeffect "; use $0", "v"(float %v0)
  ret void
}
149
; Intrinsics called by the kernels in this file: the work-item id query and
; the three f32 MFMA variants (16-, 4- and 32-element accumulators).
declare i32 @llvm.amdgcn.workitem.id.x()
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
154
; Per-kernel VGPR budgets. The groups differ only in their "amdgpu-num-vgpr"
; value; the trailing comment names the kernel that references each group.
attributes #0 = { nounwind "amdgpu-num-vgpr"="24" } ; max_24regs_32a_used
attributes #1 = { nounwind "amdgpu-num-vgpr"="10" } ; max_10_vgprs_used_9a
attributes #2 = { nounwind "amdgpu-num-vgpr"="12" } ; max_12regs_13a_used
attributes #3 = { nounwind "amdgpu-num-vgpr"="32" } ; max_32regs_mfma32
attributes #4 = { nounwind "amdgpu-num-vgpr"="5" }  ; max_5regs_used_8a
160