// RUN: mlir-opt %s -test-scf-pipelining -split-input-file | FileCheck %s
// RUN: mlir-opt %s -test-scf-pipelining=annotate -split-input-file | FileCheck %s --check-prefix ANNOTATE
// RUN: mlir-opt %s -test-scf-pipelining=no-epilogue-peeling -split-input-file | FileCheck %s --check-prefix NOEPILOGUE

// CHECK-LABEL: simple_pipeline(
//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
// Prologue:
//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
// Kernel:
//  CHECK-NEXT:   %[[L1:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C3]]
//  CHECK-SAME:     step %[[C1]] iter_args(%[[LARG:.*]] = %[[L0]]) -> (f32) {
//  CHECK-NEXT:     %[[ADD0:.*]] = arith.addf %[[LARG]], %{{.*}} : f32
//  CHECK-NEXT:     memref.store %[[ADD0]], %[[R]][%[[IV]]] : memref<?xf32>
//  CHECK-NEXT:     %[[IV1:.*]] = arith.addi %[[IV]], %[[C1]] : index
//  CHECK-NEXT:     %[[LR:.*]] = memref.load %[[A]][%[[IV1]]] : memref<?xf32>
//  CHECK-NEXT:     scf.yield %[[LR]] : f32
//  CHECK-NEXT:   }
// Epilogue:
//  CHECK-NEXT:   %[[ADD1:.*]] = arith.addf %[[L1]], %{{.*}} : f32
//  CHECK-NEXT:   memref.store %[[ADD1]], %[[R]][%[[C3]]] : memref<?xf32>
func.func @simple_pipeline(%A: memref<?xf32>, %result: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %cf = arith.constant 1.0 : f32
  scf.for %i0 = %c0 to %c4 step %c1 {
    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
    %A1_elem = arith.addf %A_elem, %cf { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32
    memref.store %A1_elem, %result[%i0] { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : memref<?xf32>
  }  { __test_pipelining_loop__ }
  return
}

// -----

// CHECK-LABEL: simple_pipeline_step(
//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
//   CHECK-DAG:   %[[C5:.*]] = arith.constant 5 : index
//   CHECK-DAG:   %[[C6:.*]] = arith.constant 6 : index
//   CHECK-DAG:   %[[C9:.*]] = arith.constant 9 : index
// Prologue:
//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
//       CHECK:   %[[L1:.*]] = memref.load %[[A]][%[[C3]]] : memref<?xf32>
// Kernel:
//  CHECK-NEXT:   %[[L2:.*]]:2 = scf.for %[[IV:.*]] = %[[C0]] to %[[C5]]
//  CHECK-SAME:     step %[[C3]] iter_args(%[[LARG0:.*]] = %[[L0]], %[[LARG1:.*]] = %[[L1]]) -> (f32, f32) {
//  CHECK-NEXT:     %[[ADD0:.*]] = arith.addf %[[LARG0]], %{{.*}} : f32
//  CHECK-NEXT:     memref.store %[[ADD0]], %[[R]][%[[IV]]] : memref<?xf32>
//  CHECK-NEXT:     %[[IV1:.*]] = arith.addi %[[IV]], %[[C6]] : index
//  CHECK-NEXT:     %[[LR:.*]] = memref.load %[[A]][%[[IV1]]] : memref<?xf32>
//  CHECK-NEXT:     scf.yield %[[LARG1]], %[[LR]] : f32, f32
//  CHECK-NEXT:   }
// Epilogue:
//  CHECK-NEXT:   %[[ADD1:.*]] = arith.addf %[[L2]]#0, %{{.*}} : f32
//  CHECK-NEXT:   memref.store %[[ADD1]], %[[R]][%[[C6]]] : memref<?xf32>
//  CHECK-NEXT:   %[[ADD2:.*]] = arith.addf %[[L2]]#1, %{{.*}} : f32
//  CHECK-NEXT:   memref.store %[[ADD2]], %[[R]][%[[C9]]] : memref<?xf32>
func.func @simple_pipeline_step(%A: memref<?xf32>, %result: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c3 = arith.constant 3 : index
  %c11 = arith.constant 11 : index
  %cf = arith.constant 1.0 : f32
  scf.for %i0 = %c0 to %c11 step %c3 {
    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
    %A1_elem = arith.addf %A_elem, %cf { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 0 } : f32
    memref.store %A1_elem, %result[%i0] { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 1 } : memref<?xf32>
  }  { __test_pipelining_loop__ }
  return
}

// -----

// CHECK-LABEL: three_stage(
//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
// Prologue:
//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
//  CHECK-NEXT:   %[[ADD0:.*]] = arith.addf %[[L0]], %{{.*}} : f32
//  CHECK-NEXT:   %[[L1:.*]] = memref.load %[[A]][%[[C1]]] : memref<?xf32>
// Kernel:
//  CHECK-NEXT:   %[[LR:.*]]:2 = scf.for %[[IV:.*]] = %[[C0]] to %[[C2]]
//  CHECK-SAME:     step %[[C1]] iter_args(%[[ADDARG:.*]] = %[[ADD0]],
//  CHECK-SAME:     %[[LARG:.*]] = %[[L1]]) -> (f32, f32) {
//  CHECK-NEXT:     memref.store %[[ADDARG]], %[[R]][%[[IV]]] : memref<?xf32>
//  CHECK-NEXT:     %[[ADD1:.*]] = arith.addf %[[LARG]], %{{.*}} : f32
//  CHECK-NEXT:     %[[IV2:.*]] = arith.addi %[[IV]], %[[C2]] : index
//  CHECK-NEXT:     %[[L3:.*]] = memref.load %[[A]][%[[IV2]]] : memref<?xf32>
//  CHECK-NEXT:     scf.yield %[[ADD1]], %[[L3]] : f32, f32
//  CHECK-NEXT:   }
// Epilogue:
//  CHECK-NEXT:   memref.store %[[LR]]#0, %[[R]][%[[C2]]] : memref<?xf32>
//  CHECK-NEXT:   %[[ADD2:.*]] = arith.addf %[[LR]]#1, %{{.*}} : f32
//  CHECK-NEXT:   memref.store %[[ADD2]], %[[R]][%[[C3]]] : memref<?xf32>

// Prologue:
//  ANNOTATE:   memref.load {{.*}} {__test_pipelining_iteration = 0 : i32, __test_pipelining_part = "prologue"}
//  ANNOTATE:   memref.load {{.*}} {__test_pipelining_iteration = 1 : i32, __test_pipelining_part = "prologue"}
// Kernel:
//  ANNOTATE:   scf.for
//  ANNOTATE:     memref.store {{.*}} {__test_pipelining_iteration = 0 : i32, __test_pipelining_part = "kernel"}
//  ANNOTATE:     arith.addf {{.*}} {__test_pipelining_iteration = 0 : i32, __test_pipelining_part = "kernel"}
//  ANNOTATE:     memref.load {{.*}} {__test_pipelining_iteration = 0 : i32, __test_pipelining_part = "kernel"}
//  ANNOTATE:     scf.yield
//  ANNOTATE:   }
// Epilogue:
//  ANNOTATE:   memref.store {{.*}} {__test_pipelining_iteration = 0 : i32, __test_pipelining_part = "epilogue"}
//  ANNOTATE:   arith.addf {{.*}} {__test_pipelining_iteration = 0 : i32, __test_pipelining_part = "epilogue"}
//  ANNOTATE:   memref.store {{.*}} {__test_pipelining_iteration = 1 : i32, __test_pipelining_part = "epilogue"}

// NOEPILOGUE-LABEL: three_stage(
//  NOEPILOGUE-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
//   NOEPILOGUE-DAG:   %[[C0:.*]] = arith.constant 0 : index
//   NOEPILOGUE-DAG:   %[[C1:.*]] = arith.constant 1 : index
//   NOEPILOGUE-DAG:   %[[C2:.*]] = arith.constant 2 : index
//   NOEPILOGUE-DAG:   %[[C3:.*]] = arith.constant 3 : index
//   NOEPILOGUE-DAG:   %[[C4:.*]] = arith.constant 4 : index
//   NOEPILOGUE-DAG:   %[[CF:.*]] = arith.constant 0.000000e+00 : f32
// Prologue:
//       NOEPILOGUE:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
//  NOEPILOGUE-NEXT:   %[[ADD0:.*]] = arith.addf %[[L0]], %{{.*}} : f32
//  NOEPILOGUE-NEXT:   %[[L1:.*]] = memref.load %[[A]][%[[C1]]] : memref<?xf32>
// Kernel:
//  NOEPILOGUE-NEXT:   %[[LR:.*]]:2 = scf.for %[[IV:.*]] = %[[C0]] to %[[C4]]
//  NOEPILOGUE-SAME:     step %[[C1]] iter_args(%[[ADDARG:.*]] = %[[ADD0]],
//  NOEPILOGUE-SAME:     %[[LARG:.*]] = %[[L1]]) -> (f32, f32) {
//   NOEPILOGUE-DAG:     %[[S0:.*]] = arith.cmpi slt, %[[IV]], %[[C2]] : index
//   NOEPILOGUE-DAG:     %[[S1:.*]] = arith.cmpi slt, %[[IV]], %[[C3]] : index
//  NOEPILOGUE-NEXT:     memref.store %[[ADDARG]], %[[R]][%[[IV]]] : memref<?xf32>
//  NOEPILOGUE-NEXT:     %[[ADD1:.*]] = scf.if %[[S1]] -> (f32) {
//  NOEPILOGUE-NEXT:       %[[PADD:.*]] = arith.addf %[[LARG]], %{{.*}} : f32
//  NOEPILOGUE-NEXT:       scf.yield %[[PADD]] : f32
//  NOEPILOGUE-NEXT:     } else {
//  NOEPILOGUE-NEXT:       scf.yield %[[CF]] : f32
//  NOEPILOGUE-NEXT:     }
//  NOEPILOGUE-NEXT:     %[[IV2:.*]] = arith.addi %[[IV]], %[[C2]] : index
//  NOEPILOGUE-NEXT:     %[[L3:.*]] = scf.if %[[S0]] -> (f32) {
//  NOEPILOGUE-NEXT:       %[[PL:.*]] = memref.load %[[A]][%[[IV2]]] : memref<?xf32>
//  NOEPILOGUE-NEXT:       scf.yield %[[PL]] : f32
//  NOEPILOGUE-NEXT:     } else {
//  NOEPILOGUE-NEXT:       scf.yield %[[CF]] : f32
//  NOEPILOGUE-NEXT:     }
//  NOEPILOGUE-NEXT:     scf.yield %[[ADD1]], %[[L3]] : f32, f32
//  NOEPILOGUE-NEXT:   }
// No epilogue should be generated.
//   NOEPILOGUE-NOT:   memref.store
//       NOEPILOGUE:   return

func.func @three_stage(%A: memref<?xf32>, %result: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %cf = arith.constant 1.0 : f32
  scf.for %i0 = %c0 to %c4 step %c1 {
    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
    %A1_elem = arith.addf %A_elem, %cf { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : f32
    memref.store %A1_elem, %result[%i0] { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 0 } : memref<?xf32>
  } { __test_pipelining_loop__ }
  return
}

// -----
// CHECK-LABEL: long_liverange(
//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
//   CHECK-DAG:   %[[C4:.*]] = arith.constant 4 : index
//   CHECK-DAG:   %[[C6:.*]] = arith.constant 6 : index
//   CHECK-DAG:   %[[C7:.*]] = arith.constant 7 : index
//   CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
//   CHECK-DAG:   %[[C9:.*]] = arith.constant 9 : index
// Prologue:
//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
//  CHECK-NEXT:   %[[L1:.*]] = memref.load %[[A]][%[[C1]]] : memref<?xf32>
//  CHECK-NEXT:   %[[L2:.*]] = memref.load %[[A]][%[[C2]]] : memref<?xf32>
//  CHECK-NEXT:   %[[L3:.*]] = memref.load %[[A]][%[[C3]]] : memref<?xf32>
// Kernel:
//  CHECK-NEXT:   %[[LR:.*]]:4 = scf.for %[[IV:.*]] = %[[C0]] to %[[C6]]
//  CHECK-SAME:     step %[[C1]] iter_args(%[[LA0:.*]] = %[[L0]],
//  CHECK-SAME:     %[[LA1:.*]] = %[[L1]], %[[LA2:.*]] = %[[L2]],
//  CHECK-SAME:     %[[LA3:.*]] = %[[L3]]) -> (f32, f32, f32, f32) {
//  CHECK-NEXT:     %[[ADD0:.*]] = arith.addf %[[LA0]], %{{.*}} : f32
//  CHECK-NEXT:     memref.store %[[ADD0]], %[[R]][%[[IV]]] : memref<?xf32>
//  CHECK-NEXT:     %[[IV4:.*]] = arith.addi %[[IV]], %[[C4]] : index
//  CHECK-NEXT:     %[[L4:.*]] = memref.load %[[A]][%[[IV4]]] : memref<?xf32>
//  CHECK-NEXT:     scf.yield %[[LA1]], %[[LA2]], %[[LA3]], %[[L4]] : f32, f32, f32, f32
//  CHECK-NEXT:   }
// Epilogue:
//  CHECK-NEXT:  %[[ADD1:.*]] = arith.addf %[[LR]]#0, %{{.*}} : f32
//  CHECK-NEXT:  memref.store %[[ADD1]], %[[R]][%[[C6]]] : memref<?xf32>
//  CHECK-NEXT:  %[[ADD2:.*]] = arith.addf %[[LR]]#1, %{{.*}} : f32
//  CHECK-NEXT:  memref.store %[[ADD2]], %[[R]][%[[C7]]] : memref<?xf32>
//  CHECK-NEXT:  %[[ADD3:.*]] = arith.addf %[[LR]]#2, %{{.*}} : f32
//  CHECK-NEXT:  memref.store %[[ADD3]], %[[R]][%[[C8]]] : memref<?xf32>
//  CHECK-NEXT:  %[[ADD4:.*]] = arith.addf %[[LR]]#3, %{{.*}} : f32
//  CHECK-NEXT:  memref.store %[[ADD4]], %[[R]][%[[C9]]] : memref<?xf32>
func.func @long_liverange(%A: memref<?xf32>, %result: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %cf = arith.constant 1.0 : f32
  scf.for %i0 = %c0 to %c10 step %c1 {
    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
    %A1_elem = arith.addf %A_elem, %cf { __test_pipelining_stage__ = 4, __test_pipelining_op_order__ = 0 } : f32
    memref.store %A1_elem, %result[%i0] { __test_pipelining_stage__ = 4, __test_pipelining_op_order__ = 1 } : memref<?xf32>
  } { __test_pipelining_loop__ }
  return
}

// -----

// CHECK-LABEL: multiple_uses(
//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
//   CHECK-DAG:   %[[C7:.*]] = arith.constant 7 : index
//   CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
//   CHECK-DAG:   %[[C9:.*]] = arith.constant 9 : index
// Prologue:
//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
//  CHECK-NEXT:   %[[ADD0:.*]] = arith.addf %[[L0]], %{{.*}} : f32
//  CHECK-NEXT:   %[[L1:.*]] = memref.load %[[A]][%[[C1]]] : memref<?xf32>
//  CHECK-NEXT:   %[[ADD1:.*]] = arith.addf %[[L1]], %{{.*}} : f32
//  CHECK-NEXT:   %[[MUL0:.*]] = arith.mulf %[[ADD0]], %[[L0]] : f32
//  CHECK-NEXT:   %[[L2:.*]] = memref.load %[[A]][%[[C2]]] : memref<?xf32>
// Kernel:
//  CHECK-NEXT:   %[[LR:.*]]:4 = scf.for %[[IV:.*]] = %[[C0]] to %[[C7]]
//  CHECK-SAME:     step %[[C1]] iter_args(%[[LA1:.*]] = %[[L1]],
//  CHECK-SAME:     %[[LA2:.*]] = %[[L2]], %[[ADDARG1:.*]] = %[[ADD1]],
//  CHECK-SAME:     %[[MULARG0:.*]] = %[[MUL0]]) -> (f32, f32, f32, f32) {
//  CHECK-NEXT:     %[[ADD2:.*]] = arith.addf %[[LA2]], %{{.*}} : f32
//  CHECK-NEXT:     %[[MUL1:.*]] = arith.mulf %[[ADDARG1]], %[[LA1]] : f32
//  CHECK-NEXT:     memref.store %[[MULARG0]], %[[R]][%[[IV]]] : memref<?xf32>
//  CHECK-NEXT:     %[[IV3:.*]] = arith.addi %[[IV]], %[[C3]] : index
//  CHECK-NEXT:     %[[L3:.*]] = memref.load %[[A]][%[[IV3]]] : memref<?xf32>
//  CHECK-NEXT:     scf.yield %[[LA2]], %[[L3]], %[[ADD2]], %[[MUL1]] : f32, f32, f32, f32
//  CHECK-NEXT:   }
// Epilogue:
//  CHECK-NEXT:   %[[ADD3:.*]] = arith.addf %[[LR]]#1, %{{.*}} : f32
//  CHECK-NEXT:   %[[MUL2:.*]] = arith.mulf %[[LR]]#2, %[[LR]]#0 : f32
//  CHECK-NEXT:   memref.store %[[LR]]#3, %[[R]][%[[C7]]] : memref<?xf32>
//  CHECK-NEXT:   %[[MUL3:.*]] = arith.mulf %[[ADD3]], %[[LR]]#1 : f32
//  CHECK-NEXT:   memref.store %[[MUL2]], %[[R]][%[[C8]]] : memref<?xf32>
//  CHECK-NEXT:   memref.store %[[MUL3]], %[[R]][%[[C9]]] : memref<?xf32>
func.func @multiple_uses(%A: memref<?xf32>, %result: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %cf = arith.constant 1.0 : f32
  scf.for %i0 = %c0 to %c10 step %c1 {
    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 3 } : memref<?xf32>
    %A1_elem = arith.addf %A_elem, %cf { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32
    %A2_elem = arith.mulf %A1_elem, %A_elem { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 1 } : f32
    memref.store %A2_elem, %result[%i0] { __test_pipelining_stage__ = 3, __test_pipelining_op_order__ = 2 } : memref<?xf32>
  } { __test_pipelining_loop__ }
  return
}

// -----

// CHECK-LABEL: loop_carried(
//  CHECK-SAME:   %[[A:.*]]: memref<?xf32>, %[[R:.*]]: memref<?xf32>) {
//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
//   CHECK-DAG:   %[[CSTF:.*]] = arith.constant 1.000000e+00 : f32
// Prologue:
//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
// Kernel:
//  CHECK-NEXT:   %[[LR:.*]]:2 = scf.for %[[IV:.*]] = %[[C0]] to %[[C3]]
//  CHECK-SAME:     step %[[C1]] iter_args(%[[C:.*]] = %[[CSTF]],
//  CHECK-SAME:     %[[LARG:.*]] = %[[L0]]) -> (f32, f32) {
//  CHECK-NEXT:     %[[ADD0:.*]] = arith.addf %[[LARG]], %[[C]] : f32
//  CHECK-NEXT:     %[[IV1:.*]] = arith.addi %[[IV]], %[[C1]] : index
//  CHECK-NEXT:     %[[L1:.*]] = memref.load %[[A]][%[[IV1]]] : memref<?xf32>
//  CHECK-NEXT:     scf.yield %[[ADD0]], %[[L1]] : f32, f32
//  CHECK-NEXT:   }
// Epilogue:
//  CHECK-NEXT:   %[[ADD1:.*]] = arith.addf %[[LR]]#1, %[[LR]]#0 : f32
//  CHECK-NEXT:   memref.store %[[ADD1]], %[[R]][%[[C0]]] : memref<?xf32>
func.func @loop_carried(%A: memref<?xf32>, %result: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %cf = arith.constant 1.0 : f32
  %r = scf.for %i0 = %c0 to %c4 step %c1 iter_args(%arg0 = %cf) -> (f32) {
    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 1 } : memref<?xf32>
    %A1_elem = arith.addf %A_elem, %arg0 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32
    scf.yield %A1_elem : f32
  }  { __test_pipelining_loop__ }
  memref.store %r, %result[%c0] : memref<?xf32>
  return
}

// -----

// CHECK-LABEL: backedge_different_stage
//  CHECK-SAME:   (%[[A:.*]]: memref<?xf32>) -> f32 {
//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
//   CHECK-DAG:   %[[CSTF:.*]] = arith.constant 1.000000e+00 : f32
// Prologue:
//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
//  CHECK-NEXT:   %[[ADD0:.*]] = arith.addf %[[L0]], %[[CSTF]] : f32
//  CHECK-NEXT:   %[[L1:.*]] = memref.load %[[A]][%[[C1]]] : memref<?xf32>
// Kernel:
//  CHECK-NEXT:   %[[R:.*]]:3 = scf.for %[[IV:.*]] = %[[C0]] to %[[C2]]
//  CHECK-SAME:     step %[[C1]] iter_args(%[[C:.*]] = %[[CSTF]],
//  CHECK-SAME:     %[[ADDARG:.*]] = %[[ADD0]], %[[LARG:.*]] = %[[L1]]) -> (f32, f32, f32) {
//  CHECK-NEXT:     %[[ADD1:.*]] = arith.addf %[[LARG]], %[[ADDARG]] : f32
//  CHECK-NEXT:     %[[IV2:.*]] = arith.addi %[[IV]], %[[C2]] : index
//  CHECK-NEXT:     %[[L2:.*]] = memref.load %[[A]][%[[IV2]]] : memref<?xf32>
//  CHECK-NEXT:     scf.yield %[[ADDARG]], %[[ADD1]], %[[L2]] : f32, f32, f32
//  CHECK-NEXT:   }
// Epilogue:
//  CHECK-NEXT:   %[[ADD2:.*]] = arith.addf %[[R]]#2, %[[R]]#1 : f32
//  CHECK-NEXT:   return %[[ADD2]] : f32
func.func @backedge_different_stage(%A: memref<?xf32>) -> f32 {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %cf = arith.constant 1.0 : f32
  %r = scf.for %i0 = %c0 to %c4 step %c1 iter_args(%arg0 = %cf) -> (f32) {
    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
    %A1_elem = arith.addf %A_elem, %arg0 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : f32
    %A2_elem = arith.mulf %cf, %A1_elem { __test_pipelining_stage__ = 2, __test_pipelining_op_order__ = 0 } : f32
    scf.yield %A2_elem : f32
  }  { __test_pipelining_loop__ }
  return %r : f32
}

// -----

// CHECK-LABEL: backedge_same_stage
//  CHECK-SAME:   (%[[A:.*]]: memref<?xf32>) -> f32 {
//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
//   CHECK-DAG:   %[[CSTF:.*]] = arith.constant 1.000000e+00 : f32
// Prologue:
//       CHECK:   %[[L0:.*]] = memref.load %[[A]][%[[C0]]] : memref<?xf32>
// Kernel:
//  CHECK-NEXT:   %[[R:.*]]:2 = scf.for %[[IV:.*]] = %[[C0]] to %[[C3]]
//  CHECK-SAME:     step %[[C1]] iter_args(%[[C:.*]] = %[[CSTF]],
//  CHECK-SAME:     %[[LARG:.*]] = %[[L0]]) -> (f32, f32) {
//  CHECK-NEXT:     %[[ADD0:.*]] = arith.addf %[[LARG]], %[[C]] : f32
//  CHECK-NEXT:     %[[IV1:.*]] = arith.addi %[[IV]], %[[C1]] : index
//  CHECK-NEXT:     %[[L2:.*]] = memref.load %[[A]][%[[IV1]]] : memref<?xf32>
//  CHECK-NEXT:     scf.yield %[[ADD0]], %[[L2]] : f32, f32
//  CHECK-NEXT:   }
// Epilogue:
//  CHECK-NEXT:   %[[ADD1:.*]] = arith.addf %[[R]]#1, %[[R]]#0 : f32
//  CHECK-NEXT:   return %[[ADD1]] : f32
func.func @backedge_same_stage(%A: memref<?xf32>) -> f32 {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %cf = arith.constant 1.0 : f32
  %r = scf.for %i0 = %c0 to %c4 step %c1 iter_args(%arg0 = %cf) -> (f32) {
    %A_elem = memref.load %A[%i0] { __test_pipelining_stage__ = 0, __test_pipelining_op_order__ = 2 } : memref<?xf32>
    %A1_elem = arith.addf %A_elem, %arg0 { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 0 } : f32
    %A2_elem = arith.mulf %cf, %A1_elem { __test_pipelining_stage__ = 1, __test_pipelining_op_order__ = 1 } : f32
    scf.yield %A2_elem : f32
  }  { __test_pipelining_loop__ }
  return %r : f32
}
