1; RUN: llc -mtriple=amdgcn -amdgpu-set-wave-priority=true -o - %s | \
2; RUN:   FileCheck %s
3
4; CHECK-LABEL: no_setprio:
5; CHECK-NOT:       s_setprio
6; CHECK:           ; return to shader part epilog
; amdgpu_ps function that returns a constant vector and makes no buffer load
; call. The FileCheck directives above expect no s_setprio instruction to be
; emitted for it.
7define amdgpu_ps <2 x float> @no_setprio() {
8  ret <2 x float> <float 0.0, float 0.0>
9}
10
11; CHECK-LABEL: vmem_in_exit_block:
12; CHECK:           s_setprio 3
13; CHECK:           buffer_load_dwordx2
14; CHECK-NEXT:      s_setprio 0
15; CHECK:           ; return to shader part epilog
; Single-block amdgpu_ps function: performs one struct buffer load using the
; inreg operand %p and returns the loaded value. The FileCheck directives above
; expect "s_setprio 3" before the load and "s_setprio 0" on the line directly
; after it.
16define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p) {
17  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
18  ret <2 x float> %v
19}
20
21; CHECK-LABEL: branch:
22; CHECK:           s_setprio 3
23; CHECK:           s_cbranch_scc0 [[A:.*]]
24; CHECK:       {{.*}}:  ; %b
25; CHECK:           buffer_load_dwordx2
26; CHECK-NEXT:      s_setprio 0
27; CHECK:           s_branch [[EXIT:.*]]
28; CHECK:       [[A]]:  ; %a
29; CHECK-NEXT:      s_setprio 0
30; CHECK:           s_branch [[EXIT]]
31; CHECK-NEXT:  [[EXIT]]:
; Two-way branch on %i == 0. Only one of the two successors contains a buffer
; load. The FileCheck directives above expect "s_setprio 3" before the branch
; and an "s_setprio 0" in each successor: right after the load in %b and as the
; first instruction of %a.
;
; The directives match blocks through the "; %a" / "; %b" comments in the asm
; output, so the block labels below must keep their names.
32define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i) {
33  %cond = icmp eq i32 %i, 0
34  br i1 %cond, label %a, label %b
35
; %a: taken when %i == 0; returns a constant and performs no load.
36a:
37  ret <2 x float> <float 0.0, float 0.0>
38
; %b: taken when %i != 0; contains the only buffer load of the function.
39b:
40  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
41  ret <2 x float> %v
42}
43
44; CHECK-LABEL: setprio_follows_setprio:
45; CHECK:           s_setprio 3
46; CHECK:           buffer_load_dwordx2
47; CHECK:           s_cbranch_scc1 [[C:.*]]
48; CHECK:       {{.*}}:  ; %a
49; CHECK:           buffer_load_dwordx2
50; CHECK-NEXT:      s_setprio 0
51; CHECK:           s_cbranch_scc1 [[C]]
52; CHECK:       {{.*}}:  ; %b
53; CHECK-NOT:       s_setprio
54; CHECK:           s_branch [[EXIT:.*]]
55; CHECK:       [[C]]:  ; %c
56; CHECK-NEXT:      s_setprio 0
57; CHECK:           s_branch [[EXIT]]
58; CHECK:       [[EXIT]]:
; Four-block function with buffer loads in %entry and %a, and none in %b or %c.
;   %entry -> %a (when %i != 0) or %c
;   %a     -> %b (when %i != 1) or %c
; The FileCheck directives above expect "s_setprio 3" before the first load,
; "s_setprio 0" directly after the load in %a, no s_setprio in %b, and an
; "s_setprio 0" as the first instruction of %c.
;
; Block labels are matched by name in the directives and must not be renamed.
59define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) {
60entry:
61  %v1 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
62  %cond1 = icmp ne i32 %i, 0
63  br i1 %cond1, label %a, label %c
64
65a:
; Second load; differs from the first only in its third i32 operand (1 vs 0).
66  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
67  %cond2 = icmp ne i32 %i, 1
68  br i1 %cond2, label %b, label %c
69
; %b: reached only through %a; returns the second loaded value unchanged.
70b:
71  ret <2 x float> %v2
72
; %c: reachable from both %entry and %a; merges whichever value was loaded last
; on the incoming path and adds the first loaded value to it.
73c:
74  %v3 = phi <2 x float> [%v1, %entry], [%v2, %a]
75  %v4 = fadd <2 x float> %v1, %v3
76  ret <2 x float> %v4
77}
78
79; CHECK-LABEL: loop:
80; CHECK:       {{.*}}:  ; %entry
81; CHECK:           s_setprio 3
82; CHECK-NOT:       s_setprio
83; CHECK:       [[LOOP:.*]]:  ; %loop
84; CHECK-NOT:       s_setprio
85; CHECK:           buffer_load_dwordx2
86; CHECK-NOT:       s_setprio
87; CHECK:           s_cbranch_scc1 [[LOOP]]
88; CHECK-NEXT:  {{.*}}:  ; %exit
89; CHECK-NEXT:      s_setprio 0
; Counted loop whose body performs a buffer load on every iteration and
; accumulates the results with fadd. The FileCheck directives above expect
; "s_setprio 3" in %entry, no s_setprio anywhere inside %loop, and
; "s_setprio 0" as the first instruction of %exit.
;
; Block labels are matched by name in the directives and must not be renamed.
90define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) {
91entry:
92  br label %loop
93
94loop:
; %i is the induction variable (starts at 0); %sum is the running total.
95  %i = phi i32 [0, %entry], [%i2, %loop]
96  %sum = phi <2 x float> [<float 0.0, float 0.0>, %entry], [%sum2, %loop]
97
98  %i2 = add i32 %i, 1
99
; The load takes the pre-increment counter %i as its first i32 operand.
100  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 %i, i32 0, i32 0, i32 0)
101  %sum2 = fadd <2 x float> %sum, %v
102
; Keep iterating while the incremented counter is below 5 (unsigned compare).
103  %cond = icmp ult i32 %i2, 5
104  br i1 %cond, label %loop, label %exit
105
106exit:
107  ret <2 x float> %sum2
108}
109
110; CHECK-LABEL: edge_split:
111; CHECK:           s_setprio 3
112; CHECK:           buffer_load_dwordx2
113; CHECK-NOT:       s_setprio
114; CHECK:           s_cbranch_scc1 [[ANOTHER_LOAD:.*]]
115; CHECK:       {{.*}}:  ; %loop.preheader
116; CHECK-NEXT:      s_setprio 0
117; CHECK:       [[LOOP:.*]]:  ; %loop
118; CHECK-NOT:       s_setprio
119; CHECK:           s_cbranch_scc1 [[LOOP]]
120; CHECK:       {{.*}}:  ; %exit
121; CHECK-NOT:       s_setprio
122; CHECK:           s_branch [[RET:.*]]
123; CHECK:       [[ANOTHER_LOAD]]:  ; %another_load
124; CHECK:           buffer_load_dwordx2
125; CHECK-NEXT:      s_setprio 0
126; CHECK:           s_branch [[RET]]
127; CHECK:       [[RET]]:
; %entry performs a buffer load and then branches either into a loop that
; contains no loads (%loop, then %exit) or to %another_load, which performs a
; second load. The FileCheck directives above expect "s_setprio 3" before the
; first load, an "s_setprio 0" at the top of a %loop.preheader block, no
; s_setprio inside %loop, and an "s_setprio 0" directly after the load in
; %another_load.
;
; NOTE(review): %loop.preheader is not written in this IR; presumably it is
; created during code generation on the %entry -> %loop edge (hence the test
; name) - confirm against the llc output.
;
; Block labels are matched by name in the directives and must not be renamed.
128define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) {
129entry:
130  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
131  %cond = icmp ne i32 %x, 0
132  br i1 %cond, label %loop, label %another_load
133
; %loop: repeatedly multiplies by the value loaded in %entry; performs no loads.
134loop:
135  %i = phi i32 [0, %entry], [%i2, %loop]
136  %mul = phi <2 x float> [%v, %entry], [%mul2, %loop]
137
138  %i2 = add i32 %i, 1
139  %mul2 = fmul <2 x float> %mul, %v
140
; Keep iterating while the incremented counter is below 5 (unsigned compare).
141  %cond2 = icmp ult i32 %i2, 5
142  br i1 %cond2, label %loop, label %exit
143
144exit:
145  ret <2 x float> %mul2
146
; %another_load: taken when %x == 0; performs the second load and returns the
; sum of both loaded values.
147another_load:
148  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
149  %sum = fadd <2 x float> %v, %v2
150  ret <2 x float> %sum
151}
152
; Declaration of the buffer load intrinsic called by the test functions in this
; file: takes one <4 x i32> operand followed by four i32 operands and returns
; <2 x float>.
153declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind
154