1// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0" -split-input-file | FileCheck %s
2
3// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)>
4// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)>
5
6// CHECK-LABEL: func @vec1d_1
// Test input: the %i0 loop only loads the loop-invariant element %A[%c0, %c0].
// The expectations require the loop to get step 128 and the load to become a
// vector.transfer_read of vector<128xf32> whose permutation_map is
// (d0, d1) -> (0). %N and %P are not used by the loop; they only feed the
// memref.dim expectations.
7func.func @vec1d_1(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
8// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
9// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
10// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
11// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
12// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
13// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
14   %c0 = arith.constant 0 : index
15   %c1 = arith.constant 1 : index
16   %c2 = arith.constant 2 : index
17   %M = memref.dim %A, %c0 : memref<?x?xf32>
18   %N = memref.dim %A, %c1 : memref<?x?xf32>
19   %P = memref.dim %B, %c2 : memref<?x?x?xf32>
20
21// CHECK: for {{.*}} step 128
22// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]])
23// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]])
24// CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32
25// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
26   affine.for %i0 = 0 to %M { // vectorized due to scalar -> vector
27     %a0 = affine.load %A[%c0, %c0] : memref<?x?xf32>
28   }
29   return
30}
31
32// -----
33
34// CHECK-LABEL: func @vec1d_2
// Test input: the %i3 loop loads %A[%c0, %i3], i.e. the induction variable
// indexes the last memref dimension. The expectations require step 128 and a
// vector.transfer_read of vector<128xf32> printed without a permutation_map
// attribute.
35func.func @vec1d_2(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
36// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
37// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
38// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
39// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
40// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
41// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
42   %c0 = arith.constant 0 : index
43   %c1 = arith.constant 1 : index
44   %c2 = arith.constant 2 : index
45   %M = memref.dim %A, %c0 : memref<?x?xf32>
46   %N = memref.dim %A, %c1 : memref<?x?xf32>
47   %P = memref.dim %B, %c2 : memref<?x?x?xf32>
48
49// CHECK:for [[IV3:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
50// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.0{{.*}}: f32
51// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %[[CST]] : memref<?x?xf32>, vector<128xf32>
52   affine.for %i3 = 0 to %M { // vectorized
53     %a3 = affine.load %A[%c0, %i3] : memref<?x?xf32>
54   }
55   return
56}
57
58// -----
59
60// CHECK-LABEL: func @vec1d_3
// Test input: a two-deep nest loading %A[%i9, %i8 + %i9]. The expectations
// require the outer %i8 loop to get step 128 while the inner %i9 loop is
// printed without a step, with both transfer_read indices produced by
// affine.apply ops on (inner IV, outer IV).
// The memref.dim patterns match their operand with a wildcard rather than a
// printed SSA name, like the other tests in this file.
61func.func @vec1d_3(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
62// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
63// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
64// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
65// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
66// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
67// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
68   %c0 = arith.constant 0 : index
69   %c1 = arith.constant 1 : index
70   %c2 = arith.constant 2 : index
71   %M = memref.dim %A, %c0 : memref<?x?xf32>
72   %N = memref.dim %A, %c1 : memref<?x?xf32>
73   %P = memref.dim %B, %c2 : memref<?x?x?xf32>
74
75// CHECK:for [[IV8:%[arg0-9]+]] = 0 to [[ARG_M]] step 128
76// CHECK-NEXT:   for [[IV9:%[arg0-9]*]] = 0 to [[ARG_N]] {
77// CHECK-NEXT:   %[[APP9_0:[0-9]+]] = affine.apply {{.*}}([[IV9]], [[IV8]])
78// CHECK-NEXT:   %[[APP9_1:[0-9]+]] = affine.apply {{.*}}([[IV9]], [[IV8]])
79// CHECK-NEXT:   %[[CST:.*]] = arith.constant 0.0{{.*}}: f32
80// CHECK-NEXT:   {{.*}} = vector.transfer_read %{{.*}}[%[[APP9_0]], %[[APP9_1]]], %[[CST]] : memref<?x?xf32>, vector<128xf32>
81   affine.for %i8 = 0 to %M { // vectorized
82     affine.for %i9 = 0 to %N {
83       %a9 = affine.load %A[%i9, %i8 + %i9] : memref<?x?xf32>
84     }
85   }
86   return
87}
88
89// -----
90
91// CHECK-LABEL: func @vector_add_2d
// Test input: three 2-D loop nests over freshly allocated MxN buffers. The
// first stores the constant 1.0 into %A, the second stores 2.0 into %B, and the
// third stores (A + B + 2.0) + (A + B + 1.0) into %C. The expectations require
// splat vector<128xf32> constants, transfer_read/transfer_write ops and vector
// addf ops. The function returns the scalar %C[7, 42].
92func.func @vector_add_2d(%M : index, %N : index) -> f32 {
93  %A = memref.alloc (%M, %N) : memref<?x?xf32, 0>
94  %B = memref.alloc (%M, %N) : memref<?x?xf32, 0>
95  %C = memref.alloc (%M, %N) : memref<?x?xf32, 0>
96  %f1 = arith.constant 1.0 : f32
97  %f2 = arith.constant 2.0 : f32
98  affine.for %i0 = 0 to %M {
99    affine.for %i1 = 0 to %N {
100      // CHECK: %[[C1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32>
101      // CHECK: vector.transfer_write %[[C1]], {{.*}} : vector<128xf32>, memref<?x?xf32>
102      // non-scoped %f1
103      affine.store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
104    }
105  }
106  affine.for %i2 = 0 to %M {
107    affine.for %i3 = 0 to %N {
108      // CHECK: %[[C3:.*]] = arith.constant dense<2.000000e+00> : vector<128xf32>
109      // CHECK: vector.transfer_write %[[C3]], {{.*}} : vector<128xf32>, memref<?x?xf32>
110      // non-scoped %f2
111      affine.store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
112    }
113  }
114  affine.for %i4 = 0 to %M {
115    affine.for %i5 = 0 to %N {
116      // CHECK: %[[SPLAT2:.*]] = arith.constant dense<2.000000e+00> : vector<128xf32>
117      // CHECK: %[[SPLAT1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32>
118      // CHECK: %[[A5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} : memref<?x?xf32>, vector<128xf32>
119      // CHECK: %[[B5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} : memref<?x?xf32>, vector<128xf32>
120      // CHECK: %[[S5:.*]] = arith.addf %[[A5]], %[[B5]] : vector<128xf32>
121      // CHECK: %[[S6:.*]] = arith.addf %[[S5]], %[[SPLAT1]] : vector<128xf32>
122      // CHECK: %[[S7:.*]] = arith.addf %[[S5]], %[[SPLAT2]] : vector<128xf32>
123      // CHECK: %[[S8:.*]] = arith.addf %[[S7]], %[[S6]] : vector<128xf32>
124      // CHECK: vector.transfer_write %[[S8]], {{.*}} : vector<128xf32>, memref<?x?xf32>
125      %a5 = affine.load %A[%i4, %i5] : memref<?x?xf32, 0>
126      %b5 = affine.load %B[%i4, %i5] : memref<?x?xf32, 0>
127      %s5 = arith.addf %a5, %b5 : f32
128      // non-scoped %f1
129      %s6 = arith.addf %s5, %f1 : f32
130      // non-scoped %f2
131      %s7 = arith.addf %s5, %f2 : f32
132      // diamond dependency.
133      %s8 = arith.addf %s7, %s6 : f32
134      affine.store %s8, %C[%i4, %i5] : memref<?x?xf32, 0>
135    }
136  }
137  %c7 = arith.constant 7 : index
138  %c42 = arith.constant 42 : index
139  %res = affine.load %C[%c7, %c42] : memref<?x?xf32, 0>
140  return %res : f32
141}
142
143// -----
144
145// CHECK-LABEL: func @vec_constant_with_two_users
// Test input: the scalar constant %f1 is stored twice inside the %i0 loop, once
// to %A[%i1, %i0] from the nested %i1 loop and once to %B[%i0] directly. The
// expectations require a single dense<1.0> vector<128xf32> constant that feeds
// both vector.transfer_write ops.
146func.func @vec_constant_with_two_users(%M : index, %N : index) -> (f32, f32) {
147  %A = memref.alloc (%M, %N) : memref<?x?xf32, 0>
148  %B = memref.alloc (%M) : memref<?xf32, 0>
149  %f1 = arith.constant 1.0 : f32
150  affine.for %i0 = 0 to %M { // vectorized
151    // CHECK:      %[[C1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32>
152    // CHECK-NEXT: affine.for
153    // CHECK-NEXT:   vector.transfer_write %[[C1]], {{.*}} : vector<128xf32>, memref<?x?xf32>
154    affine.for %i1 = 0 to %N {
155      affine.store %f1, %A[%i1, %i0] : memref<?x?xf32, 0>
156    }
157    // CHECK: vector.transfer_write %[[C1]], {{.*}} : vector<128xf32>, memref<?xf32>
158    affine.store %f1, %B[%i0] : memref<?xf32, 0>
159  }
160  %c12 = arith.constant 12 : index
161  %res1 = affine.load %A[%c12, %c12] : memref<?x?xf32, 0>
162  %res2 = affine.load %B[%c12] : memref<?xf32, 0>
163  return %res1, %res2 : f32, f32
164}
165
166// -----
167
168// CHECK-LABEL: func @vec_block_arg
// Test input: the inner induction variable %j is cast to i32 and stored to
// %A[%j, %i]. The expectations require the outer loop to run 0 to 512 with
// step 128, and the inner induction variable to be broadcast to
// vector<128xindex> and then cast to vector<128xi32> before the transfer_write.
169func.func @vec_block_arg(%A : memref<32x512xi32>) {
170  // CHECK:      affine.for %[[IV0:[arg0-9]+]] = 0 to 512 step 128 {
171  // CHECK-NEXT:   affine.for %[[IV1:[arg0-9]+]] = 0 to 32 {
172  // CHECK-NEXT:     %[[BROADCAST:.*]] = vector.broadcast %[[IV1]] : index to vector<128xindex>
173  // CHECK-NEXT:     %[[CAST:.*]] = arith.index_cast %[[BROADCAST]] : vector<128xindex> to vector<128xi32>
174  // CHECK-NEXT:     vector.transfer_write %[[CAST]], {{.*}}[%[[IV1]], %[[IV0]]] : vector<128xi32>, memref<32x512xi32>
175  affine.for %i = 0 to 512 {  // vectorized
176    affine.for %j = 0 to 32 {
177      %idx = arith.index_cast %j : index to i32
178      affine.store %idx, %A[%j, %i] : memref<32x512xi32>
179    }
180  }
181  return
182}
183
184// -----
185
186// CHECK-DAG: #[[$map0:map[0-9]+]] = affine_map<(d0, d1, d2) -> (d0 * 2 + d1 - 1)>
187// CHECK-DAG: #[[$map1:map[0-9]+]] = affine_map<(d0, d1, d2) -> (d2)>
188// CHECK-LABEL: func @vec_block_arg_2
// Test input: a three-deep nest whose body loads %A[%i0 * 2 + %i2 - 1, %i1]
// and combines the loaded value with the induction variables %i0 and %i2.
// The expectations require the middle loop (0 to 512) to get step 128, a
// broadcast of the outer IV placed between the outer and middle loops, a
// broadcast of the innermost IV inside the innermost loop, and no broadcast of
// the middle IV. The results %mul and %add have no users in this function.
189func.func @vec_block_arg_2(%A : memref<?x512xindex>) {
190  %c0 = arith.constant 0 : index
191  %N = memref.dim %A, %c0 : memref<?x512xindex>
192  // CHECK:      affine.for %[[IV0:[arg0-9]+]] = 0 to %{{.*}} {
193  // CHECK-NEXT:   %[[BROADCAST1:.*]] = vector.broadcast %[[IV0]] : index to vector<128xindex>
194  // CHECK-NEXT:   affine.for %[[IV1:[arg0-9]+]] = 0 to 512 step 128 {
195  // CHECK-NOT:      vector.broadcast %[[IV1]]
196  // CHECK:          affine.for %[[IV2:[arg0-9]+]] = 0 to 2 {
197  // CHECK-NEXT:       %[[BROADCAST2:.*]] = vector.broadcast %[[IV2]] : index to vector<128xindex>
198  // CHECK-NEXT:       %[[INDEX1:.*]] = affine.apply #[[$map0]](%[[IV0]], %[[IV2]], %[[IV1]])
199  // CHECK-NEXT:       %[[INDEX2:.*]] = affine.apply #[[$map1]](%[[IV0]], %[[IV2]], %[[IV1]])
200  // CHECK:            %[[LOAD:.*]] = vector.transfer_read %{{.*}}[%[[INDEX1]], %[[INDEX2]]], %{{.*}} : memref<?x512xindex>, vector<128xindex>
201  // CHECK-NEXT:       arith.muli %[[BROADCAST1]], %[[LOAD]] : vector<128xindex>
202  // CHECK-NEXT:       arith.addi %{{.*}}, %[[BROADCAST2]] : vector<128xindex>
203  // CHECK:          }
204  affine.for %i0 = 0 to %N {
205    affine.for %i1 = 0 to 512 { // vectorized
206      affine.for %i2 = 0 to 2 {
207        %0 = affine.load %A[%i0 * 2 + %i2 - 1, %i1] : memref<?x512xindex>
208        %mul = arith.muli %i0, %0 : index
209        %add = arith.addi %mul, %i2 : index
210      }
211    }
212  }
213  return
214}
215
216// -----
217
218// CHECK-LABEL: func @vec_rejected_1
// Test input: the %i1 loop loads %A[%i1, %i1], so the induction variable
// indexes both memref dimensions. The expectation requires the loop header to
// end right after its upper bound, i.e. to be printed without a step.
219func.func @vec_rejected_1(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
220// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
221// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
222// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
223// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
224// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
225// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
226   %c0 = arith.constant 0 : index
227   %c1 = arith.constant 1 : index
228   %c2 = arith.constant 2 : index
229   %M = memref.dim %A, %c0 : memref<?x?xf32>
230   %N = memref.dim %A, %c1 : memref<?x?xf32>
231   %P = memref.dim %B, %c2 : memref<?x?x?xf32>
232
233// CHECK:for {{.*}} [[ARG_M]] {
234   affine.for %i1 = 0 to %M { // not vectorized
235     %a1 = affine.load %A[%i1, %i1] : memref<?x?xf32>
236   }
237   return
238}
239
240// -----
241
242// CHECK-LABEL: func @vec_rejected_2
// Test input: the %i2 loop loads %A[%i2, %c0], so the induction variable
// indexes only the first memref dimension. The expectation requires the loop
// header to be printed without a step.
243func.func @vec_rejected_2(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
244// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
245// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
246// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
247// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
248// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
249// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
250   %c0 = arith.constant 0 : index
251   %c1 = arith.constant 1 : index
252   %c2 = arith.constant 2 : index
253   %M = memref.dim %A, %c0 : memref<?x?xf32>
254   %N = memref.dim %A, %c1 : memref<?x?xf32>
255   %P = memref.dim %B, %c2 : memref<?x?x?xf32>
256
257// CHECK:   affine.for %{{.*}}{{[0-9]*}} = 0 to [[ARG_M]] {
258   affine.for %i2 = 0 to %M { // not vectorized, would vectorize with --test-fastest-varying=1
259     %a2 = affine.load %A[%i2, %c0] : memref<?x?xf32>
260   }
261   return
262}
263
264// -----
265
266// CHECK-LABEL: func @vec_rejected_3
// Test input: a two-deep nest loading %A[%i5, %i4], where the outer induction
// variable indexes the last memref dimension. The expectations require the
// outer %i4 loop to get step 128 and the inner %i5 loop to be printed without
// a step; the "rejected" loop in this test is therefore the inner one.
267func.func @vec_rejected_3(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
268// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
269// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
270// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
271// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
272// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
273// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
274   %c0 = arith.constant 0 : index
275   %c1 = arith.constant 1 : index
276   %c2 = arith.constant 2 : index
277   %M = memref.dim %A, %c0 : memref<?x?xf32>
278   %N = memref.dim %A, %c1 : memref<?x?xf32>
279   %P = memref.dim %B, %c2 : memref<?x?x?xf32>
280
281// CHECK:for [[IV4:%[arg0-9]+]] = 0 to [[ARG_M]] step 128 {
282// CHECK-NEXT:   for [[IV5:%[arg0-9]*]] = 0 to [[ARG_N]] {
283// CHECK-NEXT:     %{{.*}} = arith.constant 0.0{{.*}}: f32
284// CHECK-NEXT:     {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{[a-zA-Z0-9_]*}} : memref<?x?xf32>, vector<128xf32>
285   affine.for %i4 = 0 to %M { // vectorized
286     affine.for %i5 = 0 to %N { // not vectorized, would vectorize with --test-fastest-varying=1
287       %a5 = affine.load %A[%i5, %i4] : memref<?x?xf32>
288     }
289   }
290   return
291}
292
293// -----
294
295// CHECK-LABEL: func @vec_rejected_4
// Test input: a two-deep nest loading %A[%i6 + %i7, %i6]. The expectations
// require both loop headers to be printed without a step, the inner one
// directly on the line after the outer one.
296func.func @vec_rejected_4(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
297// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
298// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
299// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
300// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
301// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
302// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
303   %c0 = arith.constant 0 : index
304   %c1 = arith.constant 1 : index
305   %c2 = arith.constant 2 : index
306   %M = memref.dim %A, %c0 : memref<?x?xf32>
307   %N = memref.dim %A, %c1 : memref<?x?xf32>
308   %P = memref.dim %B, %c2 : memref<?x?x?xf32>
309
310// CHECK: for [[IV6:%[arg0-9]*]] = 0 to [[ARG_M]] {
311// CHECK-NEXT:   for [[IV7:%[arg0-9]*]] = 0 to [[ARG_N]] {
312   affine.for %i6 = 0 to %M { // not vectorized, would vectorize with --test-fastest-varying=1
313     affine.for %i7 = 0 to %N { // not vectorized, can never vectorize
314       %a7 = affine.load %A[%i6 + %i7, %i6] : memref<?x?xf32>
315     }
316   }
317   return
318}
319
320// -----
321
322// CHECK-LABEL: func @vec_rejected_5
// Test input: a two-deep nest that loads %A[%i10, %i11] and stores the value
// back to the transposed position %A[%i11, %i10]. The expectations require
// both loop headers to be printed without a step.
323func.func @vec_rejected_5(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
324// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
325// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
326// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
327// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
328// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
329// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
330   %c0 = arith.constant 0 : index
331   %c1 = arith.constant 1 : index
332   %c2 = arith.constant 2 : index
333   %M = memref.dim %A, %c0 : memref<?x?xf32>
334   %N = memref.dim %A, %c1 : memref<?x?xf32>
335   %P = memref.dim %B, %c2 : memref<?x?x?xf32>
336
337// CHECK: for [[IV10:%[arg0-9]*]] = 0 to %{{[0-9]*}} {
338// CHECK:   for [[IV11:%[arg0-9]*]] = 0 to %{{[0-9]*}} {
339   affine.for %i10 = 0 to %M { // not vectorized, need per load transposes
340     affine.for %i11 = 0 to %N { // not vectorized, need per load transposes
341       %a11 = affine.load %A[%i10, %i11] : memref<?x?xf32>
342       affine.store %a11, %A[%i11, %i10] : memref<?x?xf32>
343     }
344   }
345   return
346}
347
348// -----
349
350// CHECK-LABEL: func @vec_rejected_6
// Test input: a three-deep nest loading
// %B[%i13, %i12 + %i13, %i12 + %i14]. The expectations require the two outer
// loop headers to be printed without a step and only the innermost %i14 loop
// to get step 128.
351func.func @vec_rejected_6(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
352// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
353// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
354// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
355// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
356// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
357// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
358   %c0 = arith.constant 0 : index
359   %c1 = arith.constant 1 : index
360   %c2 = arith.constant 2 : index
361   %M = memref.dim %A, %c0 : memref<?x?xf32>
362   %N = memref.dim %A, %c1 : memref<?x?xf32>
363   %P = memref.dim %B, %c2 : memref<?x?x?xf32>
364
365// CHECK: for [[IV12:%[arg0-9]*]] = 0 to %{{[0-9]*}} {
366// CHECK:   for [[IV13:%[arg0-9]*]] = 0 to %{{[0-9]*}} {
367// CHECK:     for [[IV14:%[arg0-9]+]] = 0 to [[ARG_P]] step 128
368   affine.for %i12 = 0 to %M { // not vectorized, can never vectorize
369     affine.for %i13 = 0 to %N { // not vectorized, can never vectorize
370       affine.for %i14 = 0 to %P { // vectorized
371         %a14 = affine.load %B[%i13, %i12 + %i13, %i12 + %i14] : memref<?x?x?xf32>
372       }
373     }
374   }
375   return
376}
377
378// -----
379
380// CHECK-LABEL: func @vec_rejected_7
// Test input: the %i16 loop allocates a memref whose element type is already
// vector<2xf32> and loads one element from it. The expectation requires the
// loop header to be printed without a step.
381func.func @vec_rejected_7(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
382// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
383// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
384// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
385// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
386// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
387// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
388   %c0 = arith.constant 0 : index
389   %c1 = arith.constant 1 : index
390   %c2 = arith.constant 2 : index
391   %M = memref.dim %A, %c0 : memref<?x?xf32>
392   %N = memref.dim %A, %c1 : memref<?x?xf32>
393   %P = memref.dim %B, %c2 : memref<?x?x?xf32>
394
395// CHECK:  affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
396   affine.for %i16 = 0 to %M { // not vectorized, can't vectorize a vector load
397     %a16 = memref.alloc(%M) : memref<?xvector<2xf32>>
398     %l16 = affine.load %a16[%i16] : memref<?xvector<2xf32>>
399   }
400   return
401}
402
403// -----
404
405// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)>
406// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)>
407
408// CHECK-LABEL: func @vec_rejected_8
// Test input: two nested loops, both bounded by %M, whose body loads the
// loop-invariant element %A[%c0, %c0]. The expectations require the outer loop
// header to be printed without a step and the inner loop to get step 128, with
// the load becoming a vector.transfer_read whose permutation_map is
// (d0, d1) -> (0).
409func.func @vec_rejected_8(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
410// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
411// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
412// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
413// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
414// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
415// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
416   %c0 = arith.constant 0 : index
417   %c1 = arith.constant 1 : index
418   %c2 = arith.constant 2 : index
419   %M = memref.dim %A, %c0 : memref<?x?xf32>
420   %N = memref.dim %A, %c1 : memref<?x?xf32>
421   %P = memref.dim %B, %c2 : memref<?x?x?xf32>
422
423// CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
424// CHECK:   for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
425// CHECK:     %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
426// CHECK:     %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
427// CHECK:     %{{.*}} = arith.constant 0.0{{.*}}: f32
428// CHECK:     {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
429   affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %i17
430     affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector
431       %a18 = affine.load %A[%c0, %c0] : memref<?x?xf32>
432     }
433   }
434   return
435}
436
437// -----
438
439// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)>
440// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)>
441
442// CHECK-LABEL: func @vec_rejected_9
// Test input: the same nest as @vec_rejected_8 (two loops bounded by %M around
// a load of the loop-invariant element %A[%c0, %c0]). Here the expectations
// additionally require the two affine.apply ops, the padding constant and the
// vector.transfer_read to appear on consecutive output lines.
443func.func @vec_rejected_9(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
444// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
445// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
446// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
447// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
448// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
449// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
450   %c0 = arith.constant 0 : index
451   %c1 = arith.constant 1 : index
452   %c2 = arith.constant 2 : index
453   %M = memref.dim %A, %c0 : memref<?x?xf32>
454   %N = memref.dim %A, %c1 : memref<?x?xf32>
455   %P = memref.dim %B, %c2 : memref<?x?x?xf32>
456
457// CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
458// CHECK:   for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
459// CHECK:      %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
460// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
461// CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32
462// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
463   affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %i17
464     affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector
465       %a18 = affine.load %A[%c0, %c0] : memref<?x?xf32>
466     }
467   }
468   return
469}
470
471// -----
472
473#set0 = affine_set<(i) : (i >= 0)>
474
475// CHECK-LABEL: func @vec_rejected_10
// Test input: the %i15 loop body is an affine.if on #set0(%i15) that guards a
// load of %A[%c0, %c0]. The expectation requires the loop header to be printed
// without a step.
476func.func @vec_rejected_10(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
477// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
478// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
479// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
480// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
481// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
482// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
483   %c0 = arith.constant 0 : index
484   %c1 = arith.constant 1 : index
485   %c2 = arith.constant 2 : index
486   %M = memref.dim %A, %c0 : memref<?x?xf32>
487   %N = memref.dim %A, %c1 : memref<?x?xf32>
488   %P = memref.dim %B, %c2 : memref<?x?x?xf32>
489
490// CHECK:  affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
491   affine.for %i15 = 0 to %M { // not vectorized due to condition below
492     affine.if #set0(%i15) {
493       %a15 = affine.load %A[%c0, %c0] : memref<?x?xf32>
494     }
495   }
496   return
497}
498
499// -----
500
501// CHECK-LABEL: func @vec_rejected_11
// Test input: a two-deep nest that loads %A[%i11, %i10] and stores the value
// to the transposed position %A[%i10, %i11]. The expectations require both
// loop headers to be printed without a step.
502func.func @vec_rejected_11(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
503  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
504  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
505  // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
506  // CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
507  // CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
508  // CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
509  %c0 = arith.constant 0 : index
510  %c1 = arith.constant 1 : index
511  %c2 = arith.constant 2 : index
512  %M = memref.dim %A, %c0 : memref<?x?xf32>
513  %N = memref.dim %A, %c1 : memref<?x?xf32>
514  %P = memref.dim %B, %c2 : memref<?x?x?xf32>
515
516  // CHECK: for [[IV10:%[arg0-9]*]] = 0 to %{{[0-9]*}} {
517  // CHECK:   for [[IV11:%[arg0-9]*]] = 0 to %{{[0-9]*}} {
518  // This is similar to vec_rejected_5, but the order of indices is different.
519  affine.for %i10 = 0 to %M { // not vectorized
520    affine.for %i11 = 0 to %N { // not vectorized
521      %a11 = affine.load %A[%i11, %i10] : memref<?x?xf32>
522      affine.store %a11, %A[%i10, %i11] : memref<?x?xf32>
523    }
524  }
525  return
526}
527
528// -----
529
530// This should not vectorize due to the sequential dependence in the loop.
531// CHECK-LABEL: @vec_rejected_sequential
// Test input: each iteration loads %A[%i] and stores that value to
// %A[%i + 1], so an iteration reads the element written by the previous one.
// The expectations forbid the string "vector" anywhere in this function's
// output.
532func.func @vec_rejected_sequential(%A : memref<?xf32>) {
533  %c0 = arith.constant 0 : index
534  %N = memref.dim %A, %c0 : memref<?xf32>
535  affine.for %i = 0 to %N {
536    // CHECK-NOT: vector
537    %a = affine.load %A[%i] : memref<?xf32>
538    // CHECK-NOT: vector
539    affine.store %a, %A[%i + 1] : memref<?xf32>
540  }
541  return
542}
543
544// -----
545
546// CHECK-LABEL: @vec_no_load_store_ops
// Test input: the loop body contains no memory access, only an arith.addf of
// the two scalar function arguments. The expectations require two
// vector.broadcast ops ahead of the loop, a loop header with a step, and an
// arith.addf of the two broadcast values directly inside it. %cst has no users
// in this function.
// The addf pattern spells out the result as "%name = ..." so the capture binds
// only the SSA name instead of swallowing the "%" and "=" through ".*".
547func.func @vec_no_load_store_ops(%a: f32, %b: f32) {
548 %cst = arith.constant 0.000000e+00 : f32
549 affine.for %i = 0 to 128 {
550   %add = arith.addf %a, %b : f32
551 }
552 // CHECK-DAG:  %[[bc1:.*]] = vector.broadcast
553 // CHECK-DAG:  %[[bc0:.*]] = vector.broadcast
554 // CHECK:      affine.for %{{.*}} = 0 to 128 step
555 // CHECK-NEXT:   %[[add:.*]] = arith.addf %[[bc0]], %[[bc1]]
556
557 return
558}
559
560// -----
561
562// This should not be vectorized due to the unsupported block argument (%i).
563// Support for operands with linear evolution is needed.
564// CHECK-LABEL: @vec_rejected_unsupported_block_arg
// Test input: the loop's own induction variable %i is cast to i32 and stored
// to %A[%i], so the stored value changes with every iteration of the loop.
// The expectation forbids the string "vector" in this function's output.
565func.func @vec_rejected_unsupported_block_arg(%A : memref<512xi32>) {
566  affine.for %i = 0 to 512 {
567    // CHECK-NOT: vector
568    %idx = arith.index_cast %i : index to i32
569    affine.store %idx, %A[%i] : memref<512xi32>
570  }
571  return
572}
573
574// -----
575
576// '%i' loop is vectorized, including the inner reduction over '%j'.
577
// Test input: for each %i in [0, 256) the inner %j loop accumulates
// %in[%j, %i] over 128 iterations through an iter_args value that starts at
// 0.0, and the sum is stored to %out[%i]. The expectations that follow the
// function require the %i loop to get step 128 and the inner loop to carry a
// vector<128xf32> iter_args value.
578func.func @vec_non_vecdim_reduction(%in: memref<128x256xf32>, %out: memref<256xf32>) {
579 %cst = arith.constant 0.000000e+00 : f32
580 affine.for %i = 0 to 256 {
581   %final_red = affine.for %j = 0 to 128 iter_args(%red_iter = %cst) -> (f32) {
582     %ld = affine.load %in[%j, %i] : memref<128x256xf32>
583     %add = arith.addf %red_iter, %ld : f32
584     affine.yield %add : f32
585   }
586   affine.store %final_red, %out[%i] : memref<256xf32>
587 }
588 return
589}
590
591// CHECK-LABEL: @vec_non_vecdim_reduction
592// CHECK:       affine.for %{{.*}} = 0 to 256 step 128 {
593// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
594// CHECK:         %[[final_red:.*]] = affine.for %{{.*}} = 0 to 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
595// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<128x256xf32>, vector<128xf32>
596// CHECK:           %[[add:.*]] = arith.addf %[[red_iter]], %[[ld]] : vector<128xf32>
597// CHECK:           affine.yield %[[add]] : vector<128xf32>
598// CHECK:         }
599// CHECK:         vector.transfer_write %[[final_red]], %{{.*}} : vector<128xf32>, memref<256xf32>
600// CHECK:       }
601
602// -----
603
604// '%i' loop is vectorized, including the inner reductions over '%j'.
605
// Test input: the inner %j loop carries two iter_args at once: an f32 sum of
// %in0[%j, %i] starting at 0.0 and an i32 product of %in1[%j, %i] starting at
// 1. Both results are stored at index %i of their output buffers. The
// expectations that follow the function require the %i loop to get step 128
// and both iter_args to become 128-element vectors.
606func.func @vec_non_vecdim_reductions(%in0: memref<128x256xf32>, %in1: memref<128x256xi32>,
607                                %out0: memref<256xf32>, %out1: memref<256xi32>) {
608 %zero = arith.constant 0.000000e+00 : f32
609 %one = arith.constant 1 : i32
610 affine.for %i = 0 to 256 {
611   %red0, %red1 = affine.for %j = 0 to 128
612     iter_args(%red_iter0 = %zero, %red_iter1 = %one) -> (f32, i32) {
613     %ld0 = affine.load %in0[%j, %i] : memref<128x256xf32>
614     %add = arith.addf %red_iter0, %ld0 : f32
615     %ld1 = affine.load %in1[%j, %i] : memref<128x256xi32>
616     %mul = arith.muli %red_iter1, %ld1 : i32
617     affine.yield %add, %mul : f32, i32
618   }
619   affine.store %red0, %out0[%i] : memref<256xf32>
620   affine.store %red1, %out1[%i] : memref<256xi32>
621 }
622 return
623}
624
625// CHECK-LABEL: @vec_non_vecdim_reductions
626// CHECK:       affine.for %{{.*}} = 0 to 256 step 128 {
627// CHECK:         %[[vone:.*]] = arith.constant dense<1> : vector<128xi32>
628// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
629// CHECK:         %[[reds:.*]]:2 = affine.for %{{.*}} = 0 to 128
630// CHECK-SAME:      iter_args(%[[red_iter0:.*]] = %[[vzero]], %[[red_iter1:.*]] = %[[vone]]) -> (vector<128xf32>, vector<128xi32>) {
631// CHECK:           %[[ld0:.*]] = vector.transfer_read %{{.*}} : memref<128x256xf32>, vector<128xf32>
632// CHECK:           %[[add:.*]] = arith.addf %[[red_iter0]], %[[ld0]] : vector<128xf32>
633// CHECK:           %[[ld1:.*]] = vector.transfer_read %{{.*}} : memref<128x256xi32>, vector<128xi32>
634// CHECK:           %[[mul:.*]] = arith.muli %[[red_iter1]], %[[ld1]] : vector<128xi32>
635// CHECK:           affine.yield %[[add]], %[[mul]] : vector<128xf32>, vector<128xi32>
636// CHECK:         }
637// CHECK:         vector.transfer_write %[[reds]]#0, %{{.*}} : vector<128xf32>, memref<256xf32>
638// CHECK:         vector.transfer_write %[[reds]]#1, %{{.*}} : vector<128xi32>, memref<256xi32>
639// CHECK:       }
640
641// -----
642
643// '%i' loop is vectorized, including the inner last value computation over '%j'.
644
// Test input: the inner %j loop yields the value it just loaded from
// %in[%j, %i] and never reads %last_iter, so its result is the value loaded in
// the final iteration; that result is stored to %out[%i]. The expectations
// that follow the function require the %i loop to get step 128 and the yielded
// value to be a vector<128xf32>.
645func.func @vec_no_vecdim_last_value(%in: memref<128x256xf32>, %out: memref<256xf32>) {
646 %cst = arith.constant 0.000000e+00 : f32
647 affine.for %i = 0 to 256 {
648   %last_val = affine.for %j = 0 to 128 iter_args(%last_iter = %cst) -> (f32) {
649     %ld = affine.load %in[%j, %i] : memref<128x256xf32>
650     affine.yield %ld : f32
651   }
652   affine.store %last_val, %out[%i] : memref<256xf32>
653 }
654 return
655}
656
657// CHECK-LABEL: @vec_no_vecdim_last_value
658// CHECK:       affine.for %{{.*}} = 0 to 256 step 128 {
659// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
660// CHECK:         %[[last_val:.*]] = affine.for %{{.*}} = 0 to 128 iter_args(%[[last_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
661// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<128x256xf32>, vector<128xf32>
662// CHECK:           affine.yield %[[ld]] : vector<128xf32>
663// CHECK:         }
664// CHECK:         vector.transfer_write %[[last_val]], %{{.*}} : vector<128xf32>, memref<256xf32>
665// CHECK:       }
666
667// -----
668
669// The inner reduction loop '%j' is not vectorized if we do not request
670// reduction vectorization.
671
// Test input: the inner %j loop sums %in[%i, %j] over 512 iterations through
// an iter_args value, so the reduction loop's induction variable indexes the
// last memref dimension; the sum is stored to %out[%i]. The expectation that
// follows the function forbids the string "vector" in this function's output.
672func.func @vec_vecdim_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
673 %cst = arith.constant 0.000000e+00 : f32
674 affine.for %i = 0 to 256 {
675   %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
676     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
677     %add = arith.addf %red_iter, %ld : f32
678     affine.yield %add : f32
679   }
680   affine.store %final_red, %out[%i] : memref<256xf32>
681 }
682 return
683}
684
685// CHECK-LABEL: @vec_vecdim_reduction_rejected
686// CHECK-NOT: vector
687