// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only allow-return-allocs" -split-input-file | FileCheck %s

// Run fuzzer with different seeds.
// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only allow-return-allocs analysis-fuzzer-seed=23" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only allow-return-allocs analysis-fuzzer-seed=59" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only allow-return-allocs analysis-fuzzer-seed=91" -split-input-file -o /dev/null

// TODO: Extract op-specific test cases and move them to their respective
// dialects.

//===----------------------------------------------------------------------===//
// Simple cases
//===----------------------------------------------------------------------===//

// -----

// CHECK-LABEL: func @extract_slice_fun(
func.func @extract_slice_fun(%A : tensor<?xf32> {bufferization.writable = false},
//  CHECK-SAME:              bufferization.access = "read"
                             %B : tensor<?xf32> {bufferization.writable = true})
//  CHECK-SAME:              bufferization.access = "read"
  -> (tensor<4xf32>, tensor<8xf32>)
{
  // Since tensor.extract_slice is not used in a write, it is not compelled to
  // bufferize out of place. Let callers decide whether they want to create
  // aliasing subviews at all call sites or whether they allocate.
  // This is true irrespective of whether the function argument is inplaceable.
  //     CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>

  //     CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  %r1 = tensor.extract_slice %B[0][8][1] : tensor<?xf32> to tensor<8xf32>

  return %r0, %r1: tensor<4xf32>, tensor<8xf32>
}
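
// For intuition: an inplace tensor.extract_slice bufferizes to an aliasing
// view rather than an allocation. A hand-written sketch of what %r0 above
// could bufferize to (kept as a comment; the layouts are illustrative, not
// the exact output of the pass):
//
//   %v = memref.subview %A[0] [4] [1]
//     : memref<?xf32, strided<[?], offset: ?>> to memref<4xf32, strided<[?], offset: ?>>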

// -----

// CHECK-LABEL: func @insert_slice_fun(
func.func @insert_slice_fun(%A : tensor<?xf32> {bufferization.writable = false},
//  CHECK-SAME:             bufferization.access = "read"
                            %B : tensor<?xf32> {bufferization.writable = true},
//  CHECK-SAME:             bufferization.access = "read-write"
                            %C : tensor<4xf32> {bufferization.writable = false})
//  CHECK-SAME:             bufferization.access = "read"
  -> (tensor<?xf32>, tensor<?xf32>)
{
  // %A is not writable, so the insertion must bufferize out of place (see the
  // sketch after this function).
  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "false"]}
  %r0 = tensor.insert_slice %C into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>

  // %B is writable, so the insertion bufferizes inplace.
  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]}
  %r1 = tensor.insert_slice %C into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [-1, 1]
  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
}
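
// A sketch of how the two insert_slice ops above could bufferize (comment
// only; buffer names and layouts are illustrative). The out-of-place %r0
// first copies %A into a fresh allocation, while the inplace %r1 writes
// through a subview of %B directly:
//
//   %alloc = memref.alloc(%d) : memref<?xf32>     // private copy of %A
//   memref.copy %A, %alloc
//   %sv0 = memref.subview %alloc[0] [4] [1] : ...
//   memref.copy %C, %sv0                          // materializes %r0
//
//   %sv1 = memref.subview %B[0] [4] [1] : ...     // %r1: inplace into %B
//   memref.copy %C, %sv1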

// -----

// CHECK-LABEL: func @conflict_on_B(
func.func @conflict_on_B(%A : tensor<4x4xf32> {bufferization.writable = true},
//  CHECK-SAME:          bufferization.access = "read"
                         %B : tensor<4x4xf32> {bufferization.writable = true})
//  CHECK-SAME:          bufferization.access = "read-write"
  -> (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>)
{
  // The matmul output operand interferes with the input operand, so it
  // bufferizes out of place (see the sketch after this function).
  //     CHECK: linalg.matmul
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false"]}
  %C = linalg.matmul  ins(%A, %B: tensor<4x4xf32>, tensor<4x4xf32>)
                     outs(%B: tensor<4x4xf32>)
    -> tensor<4x4xf32>

  // The matmul output operand interferes with the input operand.
  //     CHECK: linalg.matmul
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false"]}
  %D = linalg.matmul  ins(%B, %A: tensor<4x4xf32>, tensor<4x4xf32>)
                     outs(%B: tensor<4x4xf32>)
    -> tensor<4x4xf32>

  // The matmul output operand does not interfere with the input operand.
  //     CHECK: linalg.matmul
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]}
  %E = linalg.matmul  ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
                     outs(%B: tensor<4x4xf32>)
    -> tensor<4x4xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [-1, -1, 1]
  return %C, %D, %E: tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>
}
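
// Sketch of how an out-of-place matmul output operand is materialized
// (comment only; names are illustrative): an allocation plus a copy of the
// original buffer, since linalg.matmul also reads its output operand.
// E.g. for %C above:
//
//   %alloc = memref.alloc() : memref<4x4xf32>
//   memref.copy %B, %alloc
//   linalg.matmul ins(%A, %B : memref<4x4xf32>, memref<4x4xf32>)
//                outs(%alloc : memref<4x4xf32>)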

//===----------------------------------------------------------------------===//
// Length-1 producer-consumer cases.
//===----------------------------------------------------------------------===//

// -----

// CHECK-LABEL: func @extract_slice_extract_slice(
func.func @extract_slice_extract_slice(
    %A : tensor<?xf32> {bufferization.writable = true},
//  CHECK-SAME:         bufferization.access = "read"
    %B : tensor<?xf32> {bufferization.writable = false})
//  CHECK-SAME:         bufferization.access = "read"
  -> (tensor<2xf32>, tensor<2xf32>)
{
  // Since tensor.extract_slice is not used in a write, it is not compelled to
  // bufferize out of place. Let callers decide whether they want to create
  // aliasing subviews at all call sites or whether they allocate.
  // This is true irrespective of whether the function argument is inplaceable.
  // CHECK: {__inplace_operands_attr__ = ["true"]}
  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>

  // CHECK: {__inplace_operands_attr__ = ["true"]}
  %r1 = tensor.extract_slice %r0[0][2][1] : tensor<4xf32> to tensor<2xf32>

  // CHECK: {__inplace_operands_attr__ = ["true"]}
  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>

  // CHECK: {__inplace_operands_attr__ = ["true"]}
  %r3 = tensor.extract_slice %r2[0][2][1] : tensor<4xf32> to tensor<2xf32>

  return %r1, %r3: tensor<2xf32>, tensor<2xf32>
}

// -----

// CHECK-LABEL: func @insert_slice_insert_slice(
func.func @insert_slice_insert_slice(
    %A : tensor<?xf32> {bufferization.writable = true},
//  CHECK-SAME:         bufferization.access = "read-write"
    %A2 : tensor<4xf32> {bufferization.writable = true},
//  CHECK-SAME:          bufferization.access = "read-write"
    %A3 : tensor<2xf32> {bufferization.writable = true},
//  CHECK-SAME:          bufferization.access = "read"
    %B : tensor<?xf32> {bufferization.writable = false},
//  CHECK-SAME:         bufferization.access = "read"
    %B2 : tensor<4xf32> {bufferization.writable = false},
//  CHECK-SAME:          bufferization.access = "read"
    %B3 : tensor<2xf32> {bufferization.writable = false})
//  CHECK-SAME:          bufferization.access = "read"
  -> (tensor<?xf32>, tensor<?xf32>)
{
  // CHECK: {__inplace_operands_attr__ = ["true", "true"]}
  %r0 = tensor.insert_slice %A3 into %A2[0][2][1] : tensor<2xf32> into tensor<4xf32>

  // CHECK: {__inplace_operands_attr__ = ["true", "true"]}
  %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>

  // CHECK: {__inplace_operands_attr__ = ["true", "false"]}
  %r2 = tensor.insert_slice %B3 into %B2[0][2][1] : tensor<2xf32> into tensor<4xf32>

  // CHECK: {__inplace_operands_attr__ = ["true", "false"]}
  %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0, -1]
  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
}

// -----

// CHECK-LABEL: func @extract_slice_nonmatching_insert_slice
func.func @extract_slice_nonmatching_insert_slice(
    %A : tensor<?xf32> {bufferization.writable = true},
    %B : tensor<?xf32> {bufferization.writable = false},
    %idx: index)
  -> (tensor<?xf32>, tensor<?xf32>)
{
  // %r1 bufferizes inplace because %A is inplaceable.
  // %r0 is an overlapping tensor.extract_slice that does not match, so it must
  // bufferize out of place.
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["false"]}
  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>

  // %r1 can bufferize inplace fine.
  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]}
  %r1 = tensor.insert_slice %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor<?xf32>

  // %r3 does not bufferize inplace because %B is not inplaceable.
  // %r2 is an overlapping tensor.extract_slice that does not match, but it
  // does not alias with the buffer coming from %r3, so it can actually
  // bufferize inplace.
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>

  // %r3 cannot bufferize inplace since %B is not inplaceable.
  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "false", "none"]}
  %r3 = tensor.insert_slice %r2 into %B[%idx][4][1] : tensor<4xf32> into tensor<?xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0, -1]
  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
}

// -----

// CHECK-LABEL: func @extract_slice_matching_insert_slice
func.func @extract_slice_matching_insert_slice(
    %A : tensor<?xf32> {bufferization.writable = true},
    %B : tensor<?xf32> {bufferization.writable = false})
  -> (tensor<?xf32>, tensor<?xf32>)
{
  // %r1 bufferizes inplace because %A is inplaceable.
  // %r0 is a tensor.extract_slice that matches, so it can also be bufferized
  // inplace.
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>

  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]}
  %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>

  // %r2 is a tensor.extract_slice that matches %r3, so it can be bufferized
  // inplace.
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>

  // This tensor.insert_slice cannot bufferize inplace because %B is not
  // writable. Such a pattern should have been captured by a canonicalization
  // pattern, and it would be unproductive to have special logic in
  // bufferization to encode matching insert_slice(extract_slice(A), A)
  // (see the example after this function).
  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "false"]}
  %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0, -1]
  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
}
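
// The matching pattern mentioned above, insert_slice(extract_slice(A), A),
// can be cleaned up before bufferization. An illustrative example (not part
// of the checked test):
//
//   %e = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
//   %r = tensor.insert_slice %e into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
//
// can fold to just %A when offsets, sizes and strides match on both ops.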

// -----

// CHECK-LABEL: @read_of_matching_insert_slice_source
func.func @read_of_matching_insert_slice_source(
    %A : tensor<?xf32> {bufferization.writable = true},
    %idx : index,
    %idx2 : index)
  -> (tensor<?xf32>, vector<5xf32>)
{
  %cst = arith.constant 0.0 : f32
  %cst2 = arith.constant 1.0 : f32

  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]}
  %0 = tensor.extract_slice %A[%idx][%idx][1] : tensor<?xf32> to tensor<?xf32>

  //      CHECK: linalg.fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]}
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?xf32>) -> tensor<?xf32>

  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]}
  %2 = tensor.insert_slice %1 into %A[%idx][%idx][1] : tensor<?xf32> into tensor<?xf32>

  %3 = vector.transfer_read %1[%idx2], %cst2 : tensor<?xf32>, vector<5xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0, -1]
  return %2, %3 : tensor<?xf32>, vector<5xf32>
}

// -----

// CHECK-LABEL: @read_of_matching_insert_slice_source_interleaved
func.func @read_of_matching_insert_slice_source_interleaved(
    %A : tensor<?xf32> {bufferization.writable = true},
    %idx : index,
    %idx2 : index,
    %idx3 : index)
  -> (tensor<?xf32>, vector<5xf32>)
{
  %cst = arith.constant 0.0 : f32
  %cst2 = arith.constant 1.0 : f32

  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"]}
  %0 = tensor.extract_slice %A[%idx][%idx][1] : tensor<?xf32> to tensor<?xf32>

  //      CHECK: linalg.fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]}
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?xf32>) -> tensor<?xf32>

  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]}
  %2 = tensor.insert_slice %1 into %A[%idx][%idx][1] : tensor<?xf32> into tensor<?xf32>

  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]}
  %4 = tensor.extract_slice %2[%idx3][%idx3][1] : tensor<?xf32> to tensor<?xf32>

  //      CHECK: linalg.fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]}
  %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<?xf32>) -> tensor<?xf32>

  %3 = vector.transfer_read %1[%idx2], %cst2 : tensor<?xf32>, vector<5xf32>

  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]}
  %6 = tensor.insert_slice %5 into %2[%idx3][%idx3][1] : tensor<?xf32> into tensor<?xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0, -1]
  return %6, %3 : tensor<?xf32>, vector<5xf32>
}

// -----

// CHECK-LABEL: func @extract_slice_linalg_readonly_use
func.func @extract_slice_linalg_readonly_use(
    %A : tensor<?x?xf32> {bufferization.writable = false},
    %B : tensor<4x4xf32> {bufferization.writable = false},
    %C : tensor<4x4xf32> {bufferization.writable = true})
  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
{
  // tensor.extract_slice is only used as a read, so there is no interference
  // irrespective of the user's inplace status.
  //     CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  %sA = tensor.extract_slice %A[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>

  // The matmul output operand is not inplaceable at the function boundary.
  //     CHECK: linalg.matmul
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false"]}
  %D = linalg.matmul  ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>)
                     outs(%B: tensor<4x4xf32>)
    -> tensor<4x4xf32>

  // The matmul output operand is inplaceable at the function boundary.
  //     CHECK: linalg.matmul
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]}
  %E = linalg.matmul  ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>)
                     outs(%C: tensor<4x4xf32>)
    -> tensor<4x4xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [-1, 2]
  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
}

// -----

// CHECK-LABEL: func @extract_slice_to_linalg_write_use
func.func @extract_slice_to_linalg_write_use(
    %A : tensor<4x4xf32> {bufferization.writable = false},
    %B : tensor<?x?xf32> {bufferization.writable = false},
    %C : tensor<?x?xf32> {bufferization.writable = true})
  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
{
  // Step 4. %sB forward propagates to a write in %D, but that write is not
  // inplace. So %sB is only ever read and can bufferize inplace.
  //     CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>

  // Step 3. %sB has a read interference in %E, so the write to it does not
  // bufferize inplace.
  //     CHECK: linalg.matmul
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false"]}
  %D = linalg.matmul  ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
                     outs(%sB: tensor<4x4xf32>)
    -> tensor<4x4xf32>

  // Step 2. %sC forward propagates to an inplace write in %E.
  // %sC backward propagates to %C, which is inplaceable.
  // As a consequence this is bufferized inplace.
  //     CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>

  // Step 1. %sC backprops to the tensor.extract_slice producer, which is not
  // considered an interference. This bufferizes inplace.
  //     CHECK: linalg.matmul
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]}
  %E = linalg.matmul  ins(%A, %sB: tensor<4x4xf32>, tensor<4x4xf32>)
                     outs(%sC: tensor<4x4xf32>)
    -> tensor<4x4xf32>

  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
}

// -----

// CHECK-LABEL: func @insert_slice_double_extract_slice
func.func @insert_slice_double_extract_slice(
    %s1: index,
    %s2: index,
    %s3: index,
    %s4: index,
    %A: tensor<8x6xf32> {bufferization.writable = false},
    %B: tensor<6x6xf32> {bufferization.writable = false},
    %C: tensor<30x20xf32> {bufferization.writable = true})
  -> tensor<30x20xf32>
{
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none", "none", "none"]}
  %15 = tensor.extract_slice %C[%s3, %s4] [%s1, %s2] [1, 1] : tensor<30x20xf32> to tensor<?x?xf32>

  //      CHECK: linalg.matmul
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]}
  %18 = linalg.matmul ins(%A, %B : tensor<8x6xf32>, tensor<6x6xf32>) outs(%15 : tensor<?x?xf32>) -> tensor<?x?xf32>

  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]}
  %19 = tensor.extract_slice %18[0, 0] [%s1, %s2] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>

  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none", "none", "none"]}
  %20 = tensor.insert_slice %19 into %C[%s3, %s4] [%s1, %s2] [1, 1] : tensor<?x?xf32> into tensor<30x20xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [6]
  return %20 : tensor<30x20xf32>
}

//===----------------------------------------------------------------------===//
// Transitive cases
//===----------------------------------------------------------------------===//

// -----

// CHECK-LABEL: func @extract_slice_to_linalg_write_use
func.func @extract_slice_to_linalg_write_use(
    %A : tensor<4x4xf32> {bufferization.writable = false},
    %B : tensor<?x?xf32> {bufferization.writable = false},
    %C : tensor<?x?xf32> {bufferization.writable = true})
  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
{
  // Step 4. %sB forward propagates to an inplace write in %D.
  // %sB backward propagates to %B, which is not inplaceable.
  // As a consequence this is bufferized out of place.
  //     CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["false"]}
  %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>

  // Step 3. %sB backprops to the tensor.extract_slice producer, which is not
  // considered an interference. This bufferizes inplace.
  //     CHECK: linalg.matmul
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]}
  %D = linalg.matmul  ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
                     outs(%sB: tensor<4x4xf32>)
    -> tensor<4x4xf32>

  // Step 2. %sC forward propagates to an inplace write in %E.
  // %sC backward propagates to %C, which is inplaceable.
  // As a consequence this is bufferized inplace.
  //     CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>

  // Step 1. %sC backprops to the tensor.extract_slice producer, which is not
  // considered an interference. This bufferizes inplace.
  //     CHECK: linalg.matmul
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]}
  %E = linalg.matmul  ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
                     outs(%sC: tensor<4x4xf32>)
    -> tensor<4x4xf32>

  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
}

// -----

// CHECK-LABEL: func @nested_extract_slice_and_insert
func.func @nested_extract_slice_and_insert(
    %A : tensor<?x?xf32> {bufferization.writable = false},
    %B : tensor<?x?xf32> {bufferization.writable = true},
    %C : tensor<?x?xf32> {bufferization.writable = true},
    %idx : index,
    %sz1 : index,
    %sz2 : index)
  ->  (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>)
{
  %f0 = arith.constant 0.0 : f32

  // 2-level matching tensor.extract_slice / tensor.insert_slice into
  // non-inplaceable %A.
  //   - %rA is not inplaceable because %A is not inplaceable at the function
  //     boundary.
  //   - once %rA is deemed not inplaceable, nothing prevents %rsA from being
  //     inplaceable.
  //   - this propagates to %FA and %ssA being inplaceable.
  //   - %sA would then bufferize to an inplace write (i.e. %FA) but %A is not
  //     inplaceable and so %sA is not inplaceable.
  //     CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"]}
  // CHECK-NEXT: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  // CHECK-NEXT: fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]}
  // CHECK-NEXT: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]}
  // CHECK-NEXT: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "false", "none", "none"]}
  %sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
  %ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
  %FA = linalg.fill ins(%f0 : f32) outs(%ssA : tensor<4x4xf32>) -> tensor<4x4xf32>
  %rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
  %rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>

  // 3-level matching tensor.extract_slice / tensor.insert_slice into
  // inplaceable %B.
  // CHECK-NEXT: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]}
  // CHECK-NEXT: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]}
  // CHECK-NEXT: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  // CHECK-NEXT: fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]}
  // CHECK-NEXT: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]}
  // CHECK-NEXT: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]}
  // CHECK-NEXT: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]}
  %sB = tensor.extract_slice %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
  %ssB = tensor.extract_slice %sB[0, 0][4, %idx][1, 1] : tensor<?x?xf32> to tensor<4x?xf32>
  %sssB = tensor.extract_slice %ssB[0, 0][4, 4][1, 1] : tensor<4x?xf32> to tensor<4x4xf32>
  %FB = linalg.fill ins(%f0 : f32) outs(%sssB : tensor<4x4xf32>) -> tensor<4x4xf32>
  %rssB = tensor.insert_slice %FB into %ssB[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<4x?xf32>
  %rsB = tensor.insert_slice %rssB into %sB[0, 0][4, %idx][1, 1] : tensor<4x?xf32> into tensor<?x?xf32>
  %rB = tensor.insert_slice %rsB into %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>

  // 2-level matching tensor.extract_slice / tensor.insert_slice into
  // inplaceable %C with a twist.
  // Throw a wrench in the system: %rsC's production sizes do not match %ssC.
  // CHECK-NEXT: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]}
  // The tensor.insert_slice that would be a candidate for matching does not
  // actually match. That tensor.insert_slice can still be bufferized inplace
  // nonetheless, but this tensor.extract_slice, which bufferizes to an inplace
  // write, cannot.
  // CHECK-NEXT: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none"]}
  // CHECK-NEXT: fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]}
  // CHECK-NEXT: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]}
  // CHECK-NEXT: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]}
  %sC = tensor.extract_slice %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
  %ssC = tensor.extract_slice %sC[0, 0][%sz1, 4][1, 1] : tensor<?x?xf32> to tensor<?x4xf32>
  %FC = linalg.fill ins(%f0 : f32) outs(%ssC : tensor<?x4xf32>) -> tensor<?x4xf32>
  %rsC = tensor.insert_slice %FC into %sC[0, 0][%sz2, 4][1, 1] : tensor<?x4xf32> into tensor<?x?xf32>
  %rC = tensor.insert_slice %rsC into %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [-1, 1, 2]
  return %rA, %rB, %rC: tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>
}

// -----

//===----------------------------------------------------------------------===//
// Cross function boundary cases.
//===----------------------------------------------------------------------===//

func.func private @foo(tensor<64xf32>)

// CHECK-LABEL: dependence_through_call
func.func @dependence_through_call(%I : tensor<64xf32> {bufferization.writable = true}) {
  %f1 = arith.constant 1.000000e+00 : f32
  %f2 = arith.constant 2.000000e+00 : f32

  // 2. %B already bufferizes inplace; %A would alias it and observe a
  // different value. The calls to `foo` are conservatively assumed to read,
  // so %A cannot bufferize inplace (see the sketch after this function).
  //     CHECK: fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false"]}
  %A = linalg.fill ins(%f1 : f32) outs(%I : tensor<64xf32>) -> tensor<64xf32>

  // 1. Bufferizes inplace: no alias to %A is yet possible.
  //     CHECK: fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]}
  %B = linalg.fill ins(%f2 : f32) outs(%I : tensor<64xf32>) -> tensor<64xf32>

  call @foo(%A) : (tensor<64xf32>) -> ()
  call @foo(%B) : (tensor<64xf32>) -> ()

  return
}
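
// Sketch of the expected bufferization (comment only; names and types are
// illustrative). %A gets a private buffer because the conservative read in
// @foo(%A) would otherwise observe %B's later inplace fill of %I's buffer;
// no copy of %I is needed since the fill overwrites the whole buffer:
//
//   %alloc = memref.alloc() : memref<64xf32>                 // buffer for %A
//   linalg.fill ins(%f1 : f32) outs(%alloc : memref<64xf32>)
//   linalg.fill ins(%f2 : f32) outs(%I : memref<64xf32>)     // %B reuses %I
//   call @foo(%alloc) : (memref<64xf32>) -> ()
//   call @foo(%I) : (memref<64xf32>) -> ()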

// -----

func.func private @foo(tensor<64xf32>)

func.func private @bar(%A : tensor<64xf32>) {
  call @foo(%A) : (tensor<64xf32>) -> ()
  return
}

func.func @read_dependence_through_scf_and_call(
    %I : tensor<64xf32> {bufferization.writable = true},
    %I2 : tensor<64xf32> {bufferization.writable = true}) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %f1 = arith.constant 1.000000e+00 : f32
  %f2 = arith.constant 2.000000e+00 : f32

  // 5. %B bufferizes inplace; %A would alias it and observe a different
  // value. The calls to `foo` are conservatively assumed to read, so %A
  // cannot bufferize inplace.
  //     CHECK: fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false"]}
  %A = linalg.fill ins(%f1 : f32) outs(%I : tensor<64xf32>) -> tensor<64xf32>

  // 4. Bufferizes inplace: no alias to %A is yet possible.
  //     CHECK: fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]}
  %B = linalg.fill ins(%f2 : f32) outs(%I : tensor<64xf32>) -> tensor<64xf32>

  // 3. Does not read or write, so it bufferizes inplace.
  //      CHECK: scf.for
  // CHECK-NEXT: scf.yield
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]}
  //      CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true", "true"]}
  %r:2 = scf.for %i = %c0 to %c10 step %c1 iter_args(%0 = %A, %1 = %B)
    -> (tensor<64xf32>, tensor<64xf32>)
  {
    scf.yield %0, %1 : tensor<64xf32>, tensor<64xf32>
  }
  call @foo(%r#0) : (tensor<64xf32>) -> ()
  call @foo(%r#1) : (tensor<64xf32>) -> ()

  // 2. %B2 already bufferizes inplace; %A2 would alias it and observe a
  // different value. The calls to `foo` are conservatively assumed to read,
  // so %A2 cannot bufferize inplace.
  //     CHECK: fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false"]}
  %A2 = linalg.fill ins(%f1 : f32) outs(%I2 : tensor<64xf32>) -> tensor<64xf32>

  // 1. Bufferizes inplace: no alias to %A2 is yet possible.
  //     CHECK: fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]}
  %B2 = linalg.fill ins(%f2 : f32) outs(%I2 : tensor<64xf32>) -> tensor<64xf32>

  call @bar(%A2) : (tensor<64xf32>) -> ()
  call @bar(%B2) : (tensor<64xf32>) -> ()
  return
}

// -----

//===----------------------------------------------------------------------===//
// Transitive cases through extract_slice.
//===----------------------------------------------------------------------===//

// CHECK-LABEL: func @write_into_constant_via_alias
func.func @write_into_constant_via_alias(%v : vector<5xi32>,
                                    %s1 : index, %s2 : index,
                                    %s3 : index) -> tensor<?xi32> {
  %A = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"]}
  %b = tensor.extract_slice %A[%s1][%s2][1] : tensor<4xi32> to tensor<?xi32>
  //      CHECK: vector.transfer_write
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]}
  %r = vector.transfer_write %v, %b[%s3] : vector<5xi32>, tensor<?xi32>
  return %r : tensor<?xi32>
}
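
// Why the extract_slice above is out of place: constants bufferize to
// read-only globals, so the write needs a private copy. A sketch of the
// bufferized form (comment only; the global name is illustrative):
//
//   memref.global "private" constant @__constant_4xi32 : memref<4xi32> = dense<[1, 2, 3, 4]>
//   ...
//   %A = memref.get_global @__constant_4xi32 : memref<4xi32>
//   %alloc = memref.alloc(%s2) : memref<?xi32>   // writable copy of the slice
//   %sv = memref.subview %A[%s1] [%s2] [1] : ...
//   memref.copy %sv, %alloc
//   vector.transfer_write %v, %alloc[%s3] : vector<5xi32>, memref<?xi32>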

// -----

func.func @matmul_on_tensors(
    %arg0: tensor<518x518xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = false},
    %arg1: tensor<518x518xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = false},
    %arg2: tensor<256x256xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = true})
    -> tensor<256x256xf32>
{
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant 1.000000e+00 : f32

  %7 = bufferization.alloc_tensor() : tensor<256x256xf32>

  //      CHECK: linalg.fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false"]}
  //      CHECK: linalg.fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]}
  %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<256x256xf32>) -> tensor<256x256xf32>
  %11 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<256x256xf32>) -> tensor<256x256xf32>

  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  //      CHECK: linalg.matmul
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]}
  %sA = tensor.extract_slice %8[0, 0][256, 16][1, 1]: tensor<256x256xf32> to tensor<256x16xf32>
  %sB = tensor.extract_slice %11[0, 0][16, 256][1, 1]: tensor<256x256xf32> to tensor<16x256xf32>
  %r = linalg.matmul
         ins(%sA, %sB : tensor<256x16xf32>, tensor<16x256xf32>)
        outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [2]
  return %r : tensor<256x256xf32>
}

// -----

func.func @matmul_on_tensors(
    %arg0: tensor<518x518xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = false},
    %arg1: tensor<518x518xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = false},
    %arg2: tensor<256x256xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = true})
    -> tensor<256x256xf32>
{
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant 1.000000e+00 : f32

  %7 = bufferization.alloc_tensor() : tensor<256x256xf32>

  //      CHECK: linalg.fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false"]}
  //      CHECK: vector.transfer_write
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none", "none"]
  %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<256x256xf32>) -> tensor<256x256xf32>
  %9 = vector.transfer_read %arg0[%c0, %c0], %cst_0 {in_bounds = [false, true]} : tensor<518x518xf32>, vector<256x256xf32>
  %10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<256x256xf32>, tensor<256x256xf32>

  //      CHECK: linalg.fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]}
  //      CHECK: vector.transfer_write
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none", "none"]
  %11 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<256x256xf32>) -> tensor<256x256xf32>
  %12 = vector.transfer_read %arg1[%c0, %c0], %cst_0 {in_bounds = [false, true]} : tensor<518x518xf32>, vector<256x256xf32>
  %13 = vector.transfer_write %12, %11[%c0, %c0] {in_bounds = [true, true]} : vector<256x256xf32>, tensor<256x256xf32>

  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  //      CHECK: linalg.matmul
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]}
  %sA = tensor.extract_slice %10[0, 0][256, 16][1, 1]: tensor<256x256xf32> to tensor<256x16xf32>
  %sB = tensor.extract_slice %13[0, 0][16, 256][1, 1]: tensor<256x256xf32> to tensor<16x256xf32>
  %r = linalg.matmul
         ins(%sA, %sB : tensor<256x16xf32>, tensor<16x256xf32>)
        outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [2]
  return %r : tensor<256x256xf32>
}

// -----

//===----------------------------------------------------------------------===//
// A chain of tensor.insert_slice ops is better traversed in reverse order,
// without prioritizing the tensor.insert_slice ops.
//===----------------------------------------------------------------------===//

// CHECK-LABEL: func @insert_slice_chain(
func.func @insert_slice_chain(
    %v1: vector<32x90xf32>,
    %v2: vector<30x90xf32>,
    %arg0: tensor<62x126xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = false},
// CHECK-SAME: bufferization.access = "none"
    %arg1: tensor<126x90xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = false},
// CHECK-SAME: bufferization.access = "none"
    %arg2: tensor<62x90xf32> {bufferization.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, bufferization.writable = true})
// CHECK-SAME: bufferization.access = "write"
  -> tensor<62x90xf32> attributes {passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]}
{
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32

  //      CHECK: linalg.fill
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]
  %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<62x90xf32>) -> tensor<62x90xf32>

  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]
  %2 = tensor.extract_slice %0[0, 0] [32, 90] [1, 1] : tensor<62x90xf32> to tensor<32x90xf32>
  //      CHECK: vector.transfer_write
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none", "none"]
  %7 = vector.transfer_write %v1, %2[%c0, %c0] {in_bounds = [true, true]} : vector<32x90xf32>, tensor<32x90xf32>
  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]
  %8 = tensor.insert_slice %7 into %0[0, 0] [32, 90] [1, 1] : tensor<32x90xf32> into tensor<62x90xf32>

  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]
  %10 = tensor.extract_slice %8[32, 0] [30, 90] [1, 1] : tensor<62x90xf32> to tensor<30x90xf32>
  //      CHECK: vector.transfer_write
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none", "none"]
  %14 = vector.transfer_write %v2, %10[%c0, %c0] {in_bounds = [true, true]} : vector<30x90xf32>, tensor<30x90xf32>
  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]
  %15 = tensor.insert_slice %14 into %8[32, 0] [30, 90] [1, 1] : tensor<30x90xf32> into tensor<62x90xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [4]
  return %15 : tensor<62x90xf32>
}

// -----

//===----------------------------------------------------------------------===//
// Insert point issue cases.
//===----------------------------------------------------------------------===//

// This test only checks IR validity w.r.t. dominance.
// CHECK-LABEL: func @ip
func.func @ip(%t: tensor<10x20xf32> {bufferization.writable = true},
         %x: index, %y: index, %v: vector<5x6xf32>)
  -> tensor<10x20xf32>
{
  %c0 = arith.constant 0 : index
  %c256 = arith.constant 256 : index
  %c257 = arith.constant 257 : index
  %r = scf.for %arg0 = %c0 to %c257 step %c256 iter_args(%arg1 = %t) -> (tensor<10x20xf32>) {
    %t1 = tensor.extract_slice %arg1[%x, 0] [5, %y] [1, 1] : tensor<10x20xf32> to tensor<5x?xf32>
    %t11 = tensor.extract_slice %t1[0, 0] [5, %y] [1, 1] : tensor<5x?xf32> to tensor<5x?xf32>
    %t2 = vector.transfer_write %v, %t11[%c0, %c0] : vector<5x6xf32>, tensor<5x?xf32>
    %t3 = tensor.insert_slice %t2 into %arg1[%x, 0] [5, %y] [1, 1] : tensor<5x?xf32> into tensor<10x20xf32>
    scf.yield %t3 : tensor<10x20xf32>
  }

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %r : tensor<10x20xf32>
}

// -----

#accesses = [
  affine_map<(i) -> (i)>,
  affine_map<(i) -> (i)>,
  affine_map<(i) -> (i)>
]
#trait = {
  indexing_maps = #accesses,
  iterator_types = ["parallel"]
}

// CHECK-LABEL: func @linalg_op_same_out_tensors(
func.func @linalg_op_same_out_tensors(
    %t1: tensor<?xf32> {bufferization.writable = true},
// CHECK-SAME:          bufferization.access = "read"
    %t2: tensor<?xf32> {bufferization.writable = true})
// CHECK-SAME:          bufferization.access = "write"
  -> (tensor<?xf32>, tensor<?xf32>) {

  //      CHECK: linalg.generic
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false"]
  %o:2 = linalg.generic #trait ins(%t1 : tensor<?xf32>)
                               outs (%t2, %t2 : tensor<?xf32>, tensor<?xf32>) {
      ^bb(%0: f32, %1: f32, %2 : f32) :
        linalg.yield %0, %0 : f32, f32
    } -> (tensor<?xf32>, tensor<?xf32>)

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [1, -1]
  return %o#0, %o#1 : tensor<?xf32>, tensor<?xf32>
}

// -----

#accesses = [
  affine_map<(i) -> (i)>,
  affine_map<(i) -> (i)>,
  affine_map<(i) -> (i)>,
  affine_map<(i) -> (i)>
]
#trait = {
  indexing_maps = #accesses,
  iterator_types = ["parallel"]
}

// CHECK-LABEL: func @linalg_op_same_out_tensors_2(
func.func @linalg_op_same_out_tensors_2(
    %t1: tensor<?xf32> {bufferization.writable = true},
// CHECK-SAME:          bufferization.access = "read"
    %t2: tensor<?xf32> {bufferization.writable = true})
// CHECK-SAME:          bufferization.access = "write"
        -> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) {

  //      CHECK: linalg.generic
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false", "false"]
  %o:3 = linalg.generic #trait
          ins(%t1 : tensor<?xf32>)
          outs (%t2, %t2, %t2 : tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) {
      ^bb(%0: f32, %1: f32, %2 : f32, %3 : f32) :
        linalg.yield %0, %0, %0 : f32, f32, f32
    } -> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [1, -1, -1]
  return %o#0, %o#1, %o#2 : tensor<?xf32>, tensor<?xf32>, tensor<?xf32>
}

// -----

// CHECK-LABEL: func @double_insert_slice_into_alias
func.func @double_insert_slice_into_alias(
    %v1: vector<32x90xf32>,
    %v2: vector<30x90xf32>,
    %arg2: tensor<62x90xf32> {bufferization.writable = true},
    %s1: index, %s2: index, %s3: index, %s4: index)
  -> (tensor<62x90xf32>, tensor<?x?xf32>)
{
  %c0 = arith.constant 0 : index

  // This extract_slice cannot bufferize inplace because both the operand and
  // the result are modified and returned separately.
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none", "none", "none"]
  %e = tensor.extract_slice %arg2[%s1, %s2][%s3, %s4][1, 1] : tensor<62x90xf32> to tensor<?x?xf32>

  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]
  %2 = tensor.extract_slice %arg2[0, 0] [32, 90] [1, 1] : tensor<62x90xf32> to tensor<32x90xf32>
  //      CHECK: vector.transfer_write
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none", "none"]
  %7 = vector.transfer_write %v1, %2[%c0, %c0] {in_bounds = [true, true]} : vector<32x90xf32>, tensor<32x90xf32>
  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]
  %8 = tensor.insert_slice %7 into %arg2[0, 0] [32, 90] [1, 1] : tensor<32x90xf32> into tensor<62x90xf32>

  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]
  %10 = tensor.extract_slice %e[32, 0] [30, 90] [1, 1] : tensor<?x?xf32> to tensor<30x90xf32>
  //      CHECK: vector.transfer_write
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none", "none"]
  %14 = vector.transfer_write %v2, %10[%c0, %c0] {in_bounds = [true, true]} : vector<30x90xf32>, tensor<30x90xf32>
  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]
  %15 = tensor.insert_slice %14 into %e[32, 0] [30, 90] [1, 1] : tensor<30x90xf32> into tensor<?x?xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [2, -1]
  return %8, %15 : tensor<62x90xf32>, tensor<?x?xf32>
}

// -----

// CHECK-LABEL: func @interleaved_extract_insert_slice_chain_1
func.func @interleaved_extract_insert_slice_chain_1(
    %arg2: tensor<62x90xf32> {bufferization.writable = true})
  -> (tensor<62x90xf32>)
{
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]
  %2 = tensor.extract_slice %arg2[0, 0] [32, 90] [1, 1] : tensor<62x90xf32> to tensor<32x90xf32>

  // TODO: This should bufferize inplace once we have a proper range analysis.
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["false"]
  %10 = tensor.extract_slice %arg2[32, 0] [30, 90] [1, 1] : tensor<62x90xf32> to tensor<30x90xf32>

  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]
  %8 = tensor.insert_slice %2 into %arg2[0, 0] [32, 90] [1, 1] : tensor<32x90xf32> into tensor<62x90xf32>

  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]
  %15 = tensor.insert_slice %10 into %8[32, 0] [30, 90] [1, 1] : tensor<30x90xf32> into tensor<62x90xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %15 : tensor<62x90xf32>
}

// -----

// CHECK-LABEL: func @interleaved_extract_insert_slice_chain_2
func.func @interleaved_extract_insert_slice_chain_2(
    %arg2: tensor<62x90xf32> {bufferization.writable = true})
  -> (tensor<62x90xf32>)
{
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]
  %2 = tensor.extract_slice %arg2[0, 0] [32, 90] [1, 1] : tensor<62x90xf32> to tensor<32x90xf32>

  // The slices are overlapping, so this can never bufferize inplace.
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["false"]
  %10 = tensor.extract_slice %arg2[31, 0] [30, 90] [1, 1] : tensor<62x90xf32> to tensor<30x90xf32>

  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]
  %8 = tensor.insert_slice %2 into %arg2[0, 0] [32, 90] [1, 1] : tensor<32x90xf32> into tensor<62x90xf32>

  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]
  %15 = tensor.insert_slice %10 into %8[31, 0] [30, 90] [1, 1] : tensor<30x90xf32> into tensor<62x90xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %15 : tensor<62x90xf32>
}

// -----

// CHECK-LABEL: func @extract_once_insert_twice
func.func @extract_once_insert_twice(
    %arg2: tensor<62x90xf32> {bufferization.writable = true})
  -> (tensor<62x90xf32>)
{
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["false"]
  %2 = tensor.extract_slice %arg2[0, 0] [32, 90] [1, 1] : tensor<62x90xf32> to tensor<32x90xf32>

  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]
  %8 = tensor.insert_slice %2 into %arg2[0, 0] [32, 90] [1, 1] : tensor<32x90xf32> into tensor<62x90xf32>

  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]
  %15 = tensor.insert_slice %2 into %8[15, 0] [32, 90] [1, 1] : tensor<32x90xf32> into tensor<62x90xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %15 : tensor<62x90xf32>
}

// -----

// CHECK-LABEL: func @some_use
func.func @some_use(%A : tensor<?xf32> {bufferization.writable = true},
                    %v : vector<5xf32>) -> (tensor<?xf32>) {
  %idx = arith.constant 0 : index
  //      CHECK: vector.transfer_write
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
  %0 = vector.transfer_write %v, %A[%idx] : vector<5xf32>, tensor<?xf32>
  return %0 : tensor<?xf32>
}

// CHECK-LABEL: func @main_func
func.func @main_func(%A : tensor<?xf32> {bufferization.writable = true},
                     %v : vector<5xf32>) -> (tensor<?xf32>) {
  //      CHECK: call
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]
  %0 = call @some_use(%A, %v) : (tensor<?xf32>, vector<5xf32>) -> (tensor<?xf32>)
  return %0 : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @to_tensor_op_not_writable
func.func @to_tensor_op_not_writable(%m: memref<?xf32>, %v: vector<5xf32>,
                                %idx1: index, %idx2: index)
    -> vector<10xf32> {
  %0 = bufferization.to_tensor %m : memref<?xf32>

  // Write to the tensor. This cannot be inplace because the tensor stems from
  // a bufferization.to_tensor op, which is not considered writable (see the
  // variant sketched after this function).
  //      CHECK: vector.transfer_write
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"]
  %w = vector.transfer_write %v, %0[%idx1] : vector<5xf32>, tensor<?xf32>

  // Read from the tensor and return result.
  %cst = arith.constant 0.0 : f32
  %r = vector.transfer_read %w[%idx2], %cst : tensor<?xf32>, vector<10xf32>
  return %r : vector<10xf32>
}
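
// If the memref is known to have no other aliases and may be written through
// the resulting tensor, the op can carry the `restrict` and `writable`
// keywords, which would allow the write above to be inplace. An illustrative
// variant (not part of this test):
//
//   %0 = bufferization.to_tensor %m restrict writable : memref<?xf32>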

// -----

// CHECK-LABEL: func @to_memref_op_is_reading
func.func @to_memref_op_is_reading(%t1: tensor<?xf32> {bufferization.writable = true},
                                   %idx1: index, %idx2: index, %idx3: index,
                                   %v1: vector<5xf32>)
    -> (vector<5xf32>, vector<5xf32>) {
  // Write + read to/from tensor.
  //      CHECK: vector.transfer_write
  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"]
  %1 = vector.transfer_write %v1, %t1[%idx2] : vector<5xf32>, tensor<?xf32>
  %cst = arith.constant 0.0 : f32
  %r1 = vector.transfer_read %1[%idx3], %cst : tensor<?xf32>, vector<5xf32>

  // Write + read to/from same memref.
  %0 = bufferization.to_memref %t1 : memref<?xf32>
  vector.transfer_write %v1, %0[%idx1] : vector<5xf32>, memref<?xf32>
  %r2 = vector.transfer_read %0[%idx3], %cst : memref<?xf32>, vector<5xf32>

  return %r1, %r2 : vector<5xf32>, vector<5xf32>
}
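
// Sketch of why the transfer_write above is out of place (comment only; names
// are illustrative): the to_memref op is conservatively assumed to read %t1
// through the resulting memref, so the tensor write gets a private copy of
// %t1's buffer:
//
//   %alloc = memref.alloc(%d) : memref<?xf32>
//   memref.copy %t1, %alloc
//   vector.transfer_write %v1, %alloc[%idx2] : vector<5xf32>, memref<?xf32>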

// -----

// CHECK-LABEL: func @inner_func
func.func @inner_func(%t: tensor<?xf32>) -> tensor<?xf32> {
  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %t : tensor<?xf32>
}

func.func @equivalent_func_arg(%c0: index, %c10: index, %c1: index, %t0: tensor<?xf32>) -> tensor<?xf32> {
  // This test does not check IR. It just asserts there is no failure due to
  // non-equivalent scf.for yield values.
  %1 = scf.for %iv = %c0 to %c10 step %c1 iter_args(%t1 = %t0) -> (tensor<?xf32>) {
    %3 = func.call @inner_func(%t1) : (tensor<?xf32>) -> tensor<?xf32>
    scf.yield %3 : tensor<?xf32>
  }
  return %1 : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @inner_func_2
func.func @inner_func_2(%t: tensor<?xf32>) -> tensor<?xf32> {
  %f = arith.constant 1.0 : f32
  %c0 = arith.constant 0 : index
  %0 = tensor.insert %f into %t[%c0] : tensor<?xf32>
  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %0 : tensor<?xf32>
}

func.func @equivalent_func_arg_2(%c0: index, %c10: index, %c1: index, %t0: tensor<?xf32>) -> tensor<?xf32> {
  // This test does not check IR. It just asserts there is no failure due to
  // non-equivalent scf.for yield values.
  %1 = scf.for %iv = %c0 to %c10 step %c1 iter_args(%t1 = %t0) -> (tensor<?xf32>) {
    %3 = func.call @inner_func_2(%t1) : (tensor<?xf32>) -> tensor<?xf32>
    scf.yield %3 : tensor<?xf32>
  }
  return %1 : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @write_after_select_read_one
//  CHECK-SAME:     %[[t1:.*]]: tensor<?xf32> {{.*}}, %[[t2:.*]]: tensor<?xf32>
func.func @write_after_select_read_one(
    %t1 : tensor<?xf32> {bufferization.writable = true},
    %t2 : tensor<?xf32> {bufferization.writable = true},
    %c : i1)
  -> (f32, tensor<?xf32>)
{
  %cst = arith.constant 0.0 : f32
  %idx = arith.constant 0 : index

  //      CHECK: arith.select %{{.*}}, %[[t1]], %[[t2]]
  // CHECK-SAME:   {__inplace_operands_attr__ = ["none", "false", "true"]}
  %s = arith.select %c, %t1, %t2 : tensor<?xf32>
  //      CHECK: tensor.insert
  // CHECK-SAME:   {__inplace_operands_attr__ = ["none", "true", "none"]}
  %w = tensor.insert %cst into %s[%idx] : tensor<?xf32>
  //      CHECK: tensor.extract
  // CHECK-SAME:   {__inplace_operands_attr__ = ["true", "none"]}
  %f = tensor.extract %t1[%idx] : tensor<?xf32>

  return %f, %w : f32, tensor<?xf32>
}

// -----

// CHECK-LABEL: func @write_after_select_read_both
//  CHECK-SAME:     %[[t1:.*]]: tensor<?xf32> {{.*}}, %[[t2:.*]]: tensor<?xf32>
func.func @write_after_select_read_both(
    %t1 : tensor<?xf32> {bufferization.writable = true},
    %t2 : tensor<?xf32> {bufferization.writable = true},
    %c : i1)
  -> (f32, f32, tensor<?xf32>)
{
  %cst = arith.constant 0.0 : f32
  %idx = arith.constant 0 : index

  //      CHECK: arith.select %{{.*}}, %[[t1]], %[[t2]]
  // CHECK-SAME:   {__inplace_operands_attr__ = ["none", "false", "false"]}
  %s = arith.select %c, %t1, %t2 : tensor<?xf32>
  //      CHECK: tensor.insert
  // CHECK-SAME:   {__inplace_operands_attr__ = ["none", "true", "none"]}
  %w = tensor.insert %cst into %s[%idx] : tensor<?xf32>
  //      CHECK: tensor.extract
  // CHECK-SAME:   {__inplace_operands_attr__ = ["true", "none"]}
  %f = tensor.extract %t1[%idx] : tensor<?xf32>
  //      CHECK: tensor.extract
  // CHECK-SAME:   {__inplace_operands_attr__ = ["true", "none"]}
  %f2 = tensor.extract %t2[%idx] : tensor<?xf32>

  return %f, %f2, %w : f32, f32, tensor<?xf32>
}

// -----

// CHECK-LABEL: func @write_after_select_no_conflict
//  CHECK-SAME:     %[[t1:.*]]: tensor<?xf32> {{.*}}, %[[t2:.*]]: tensor<?xf32>
func.func @write_after_select_no_conflict(
    %t1 : tensor<?xf32> {bufferization.writable = true},
    %t2 : tensor<?xf32> {bufferization.writable = true},
    %c : i1)
  -> (f32, tensor<?xf32>)
{
  %cst = arith.constant 0.0 : f32
  %idx = arith.constant 0 : index

  //      CHECK: arith.select %{{.*}}, %[[t1]], %[[t2]]
  // CHECK-SAME:   {__inplace_operands_attr__ = ["none", "true", "true"]}
  %s = arith.select %c, %t1, %t2 : tensor<?xf32>
  //      CHECK: tensor.insert
  // CHECK-SAME:   {__inplace_operands_attr__ = ["none", "true", "none"]}
  %w = tensor.insert %cst into %s[%idx] : tensor<?xf32>
  //      CHECK: tensor.extract
  // CHECK-SAME:   {__inplace_operands_attr__ = ["true", "none"]}
  %f = tensor.extract %w[%idx] : tensor<?xf32>

  return %f, %w : f32, tensor<?xf32>
}
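
// Note on arith.select: when both operands bufferize inplace as above, the
// select itself can bufferize to a select of the two buffers and the
// tensor.insert to a store (a sketch, comment only; names and the layout are
// illustrative):
//
//   %s = arith.select %c, %t1, %t2 : memref<?xf32, strided<[?], offset: ?>>
//   memref.store %cst, %s[%idx]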

// -----

// CHECK-LABEL: func @write_to_same_tensor_in_loop_out_of_place(
func.func @write_to_same_tensor_in_loop_out_of_place(
    %A : tensor<?xf32> {bufferization.writable = true},
    %B : tensor<?xf32> {bufferization.writable = true},
    %lb : index, %ub : index, %step : index, %sz: index)
  -> (tensor<?xf32>)
{
  // CHECK: scf.for {{.*}} {
  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
    %i2 = arith.index_cast %i : index to i32
    %i3 = arith.sitofp %i2 : i32 to f32
    // The tensor.insert is out of place because %B is written multiple times
    // inside the loop.
    //      CHECK: tensor.insert
    // CHECK-SAME:   {__inplace_operands_attr__ = ["none", "false", "none"]}
    %B2 = tensor.insert %i3 into %B[%i] : tensor<?xf32>
    //      CHECK: tensor.insert_slice
    // CHECK-SAME:   {__inplace_operands_attr__ = ["true", "true", "none", "none"]}
    %A2 = tensor.insert_slice %B2 into %t[%i][%sz][1] : tensor<?xf32> into tensor<?xf32>
    scf.yield %A2 : tensor<?xf32>
  }
  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}

  return %r0 : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @write_to_same_alloc_tensor_in_place(
func.func @write_to_same_alloc_tensor_in_place(
    %A : tensor<?xf32> {bufferization.writable = true},
    %lb : index, %ub : index, %step : index, %sz: index, %sz2: index)
  -> (tensor<?xf32>)
{
  %B = bufferization.alloc_tensor(%sz2) : tensor<?xf32>

  // CHECK: scf.for {{.*}} {
  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
    %i2 = arith.index_cast %i : index to i32
    %i3 = arith.sitofp %i2 : i32 to f32
    // %B is written multiple times inside a loop, but it is an alloc_tensor.
    //      CHECK: tensor.insert
    // CHECK-SAME:   {__inplace_operands_attr__ = ["none", "true", "none"]}
    %B2 = tensor.insert %i3 into %B[%i] : tensor<?xf32>
    //      CHECK: tensor.insert_slice
    // CHECK-SAME:   {__inplace_operands_attr__ = ["true", "true", "none", "none"]}
    %A2 = tensor.insert_slice %B2 into %t[%i][%sz][1] : tensor<?xf32> into tensor<?xf32>
    scf.yield %A2 : tensor<?xf32>
  }
  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}

  return %r0 : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @write_to_same_alloc_tensor_out_of_place(
func.func @write_to_same_alloc_tensor_out_of_place(
    %A : tensor<?xf32> {bufferization.writable = true},
    %lb : index, %ub : index, %step : index, %sz: index, %sz2: index, %f: f32)
  -> (tensor<?xf32>)
{
  %B = bufferization.alloc_tensor(%sz2) : tensor<?xf32>
  %C = tensor.insert %f into %B[%lb] : tensor<?xf32>

  // CHECK: scf.for {{.*}} {
  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
    %i2 = arith.index_cast %i : index to i32
    %i3 = arith.sitofp %i2 : i32 to f32
    // %C is written multiple times inside a loop. Even though %C aliases with
    // an alloc_tensor, out-of-place bufferization is necessary because there
    // is another alias (%C) outside of the loop (see the sketch after this
    // function).
    //      CHECK: tensor.insert
    // CHECK-SAME:   {__inplace_operands_attr__ = ["none", "false", "none"]}
    %B2 = tensor.insert %i3 into %C[%i] : tensor<?xf32>
    //      CHECK: tensor.insert_slice
    // CHECK-SAME:   {__inplace_operands_attr__ = ["true", "true", "none", "none"]}
    %A2 = tensor.insert_slice %B2 into %t[%i][%sz][1] : tensor<?xf32> into tensor<?xf32>
    scf.yield %A2 : tensor<?xf32>
  }
  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}

  return %r0 : tensor<?xf32>
}
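
// Sketch of the out-of-place tensor.insert above (comment only; names are
// illustrative): each iteration gets a private copy of %C's buffer before the
// store, so %C itself stays intact across iterations:
//
//   %copy = memref.alloc(%sz2) : memref<?xf32>
//   memref.copy %C, %copy
//   memref.store %i3, %copy[%i]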