// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0 vectorize-reductions=true" -split-input-file | FileCheck %s

// The inner reduction loop '%j' is vectorized. The horizontal reduction that
// follows the loop must consume the result of the vectorized loop.

func.func @vecdim_reduction(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[red_iter]], %[[ld]] : vector<128xf32>
// CHECK:           affine.yield %[[add]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[final_sum:.*]] = vector.reduction <add>, %[[vred]] : vector<128xf32> into f32
// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// Floating-point minimum reduction. The accumulator starts at 0x7F800000
// (+infinity), which is splatted into the initial vector. The inner loop '%j'
// is vectorized and its result feeds a single 'vector.reduction <minf>'.

func.func @vecdim_reduction_minf(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0x7F800000 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %min = arith.minf %red_iter, %ld : f32
      affine.yield %min : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_minf
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vmax:.*]] = arith.constant dense<0x7F800000> : vector<128xf32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmax]]) -> (vector<128xf32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[min:.*]] = arith.minf %[[red_iter]], %[[ld]] : vector<128xf32>
// CHECK:           affine.yield %[[min]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[final_min:.*]] = vector.reduction <minf>, %[[vred]] : vector<128xf32> into f32
// CHECK:         affine.store %[[final_min]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// Floating-point maximum reduction. The accumulator starts at 0xFF800000
// (-infinity), which is splatted into the initial vector. The inner loop '%j'
// is vectorized and its result feeds a single 'vector.reduction <maxf>'.

func.func @vecdim_reduction_maxf(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0xFF800000 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %max = arith.maxf %red_iter, %ld : f32
      affine.yield %max : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_maxf
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vmin:.*]] = arith.constant dense<0xFF800000> : vector<128xf32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmin]]) -> (vector<128xf32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[max:.*]] = arith.maxf %[[red_iter]], %[[ld]] : vector<128xf32>
// CHECK:           affine.yield %[[max]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[final_max:.*]] = vector.reduction <maxf>, %[[vred]] : vector<128xf32> into f32
// CHECK:         affine.store %[[final_max]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// Signed integer minimum reduction. The accumulator starts at 2147483647 (the
// largest signed i32), which is splatted into the initial vector. The inner
// loop '%j' is vectorized and its result feeds 'vector.reduction <minsi>'.

func.func @vecdim_reduction_minsi(%in: memref<256x512xi32>, %out: memref<256xi32>) {
  %cst = arith.constant 2147483647 : i32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xi32>
      %min = arith.minsi %red_iter, %ld : i32
      affine.yield %min : i32
    }
    affine.store %final_red, %out[%i] : memref<256xi32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_minsi
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vmax:.*]] = arith.constant dense<2147483647> : vector<128xi32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmax]]) -> (vector<128xi32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32>
// CHECK:           %[[min:.*]] = arith.minsi %[[red_iter]], %[[ld]] : vector<128xi32>
// CHECK:           affine.yield %[[min]] : vector<128xi32>
// CHECK:         }
// CHECK:         %[[final_min:.*]] = vector.reduction <minsi>, %[[vred]] : vector<128xi32> into i32
// CHECK:         affine.store %[[final_min]], %{{.*}} : memref<256xi32>
// CHECK:       }

// -----

// Signed integer maximum reduction. The accumulator starts at -2147483648 (the
// smallest signed i32), which is splatted into the initial vector. The inner
// loop '%j' is vectorized and its result feeds 'vector.reduction <maxsi>'.

func.func @vecdim_reduction_maxsi(%in: memref<256x512xi32>, %out: memref<256xi32>) {
  %cst = arith.constant -2147483648 : i32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xi32>
      %max = arith.maxsi %red_iter, %ld : i32
      affine.yield %max : i32
    }
    affine.store %final_red, %out[%i] : memref<256xi32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_maxsi
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vmin:.*]] = arith.constant dense<-2147483648> : vector<128xi32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmin]]) -> (vector<128xi32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32>
// CHECK:           %[[max:.*]] = arith.maxsi %[[red_iter]], %[[ld]] : vector<128xi32>
// CHECK:           affine.yield %[[max]] : vector<128xi32>
// CHECK:         }
// CHECK:         %[[final_max:.*]] = vector.reduction <maxsi>, %[[vred]] : vector<128xi32> into i32
// CHECK:         affine.store %[[final_max]], %{{.*}} : memref<256xi32>
// CHECK:       }

// -----

// Unsigned integer minimum reduction. The accumulator starts at -1 (all bits
// set, i.e. the largest unsigned i32), which is splatted into the initial
// vector. The inner loop '%j' is vectorized and its result feeds
// 'vector.reduction <minui>'.

func.func @vecdim_reduction_minui(%in: memref<256x512xi32>, %out: memref<256xi32>) {
  %cst = arith.constant -1 : i32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xi32>
      %min = arith.minui %red_iter, %ld : i32
      affine.yield %min : i32
    }
    affine.store %final_red, %out[%i] : memref<256xi32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_minui
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vmax:.*]] = arith.constant dense<-1> : vector<128xi32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmax]]) -> (vector<128xi32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32>
// CHECK:           %[[min:.*]] = arith.minui %[[red_iter]], %[[ld]] : vector<128xi32>
// CHECK:           affine.yield %[[min]] : vector<128xi32>
// CHECK:         }
// CHECK:         %[[final_min:.*]] = vector.reduction <minui>, %[[vred]] : vector<128xi32> into i32
// CHECK:         affine.store %[[final_min]], %{{.*}} : memref<256xi32>
// CHECK:       }

// -----

// Unsigned integer maximum reduction. The accumulator starts at 0 (the
// smallest unsigned value), which is splatted into the initial vector. The
// inner loop '%j' is vectorized and its result feeds
// 'vector.reduction <maxui>'.

func.func @vecdim_reduction_maxui(%in: memref<256x512xi32>, %out: memref<256xi32>) {
  %cst = arith.constant 0 : i32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xi32>
      %max = arith.maxui %red_iter, %ld : i32
      affine.yield %max : i32
    }
    affine.store %final_red, %out[%i] : memref<256xi32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_maxui
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vmin:.*]] = arith.constant dense<0> : vector<128xi32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmin]]) -> (vector<128xi32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32>
// CHECK:           %[[max:.*]] = arith.maxui %[[red_iter]], %[[ld]] : vector<128xi32>
// CHECK:           affine.yield %[[max]] : vector<128xi32>
// CHECK:         }
// CHECK:         %[[final_max:.*]] = vector.reduction <maxui>, %[[vred]] : vector<128xi32> into i32
// CHECK:         affine.store %[[final_max]], %{{.*}} : memref<256xi32>
// CHECK:       }

// -----

// Bitwise AND reduction. The accumulator starts at -1 (all bits set), which is
// splatted into the initial vector. The inner loop '%j' is vectorized and its
// result feeds 'vector.reduction <and>'.

func.func @vecdim_reduction_andi(%in: memref<256x512xi32>, %out: memref<256xi32>) {
  %cst = arith.constant -1 : i32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xi32>
      %and = arith.andi %red_iter, %ld : i32
      affine.yield %and : i32
    }
    affine.store %final_red, %out[%i] : memref<256xi32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_andi
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vallone:.*]] = arith.constant dense<-1> : vector<128xi32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vallone]]) -> (vector<128xi32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32>
// CHECK:           %[[and:.*]] = arith.andi %[[red_iter]], %[[ld]] : vector<128xi32>
// CHECK:           affine.yield %[[and]] : vector<128xi32>
// CHECK:         }
// CHECK:         %[[final_red:.*]] = vector.reduction <and>, %[[vred]] : vector<128xi32> into i32
// CHECK:         affine.store %[[final_red]], %{{.*}} : memref<256xi32>
// CHECK:       }

// -----

// Bitwise OR reduction. The accumulator starts at 0, which is splatted into
// the initial vector. The inner loop '%j' is vectorized and its result feeds
// 'vector.reduction <or>'.

func.func @vecdim_reduction_ori(%in: memref<256x512xi32>, %out: memref<256xi32>) {
  %cst = arith.constant 0 : i32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xi32>
      %or = arith.ori %red_iter, %ld : i32
      affine.yield %or : i32
    }
    affine.store %final_red, %out[%i] : memref<256xi32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_ori
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0> : vector<128xi32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xi32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32>
// CHECK:           %[[or:.*]] = arith.ori %[[red_iter]], %[[ld]] : vector<128xi32>
// CHECK:           affine.yield %[[or]] : vector<128xi32>
// CHECK:         }
// CHECK:         %[[final_red:.*]] = vector.reduction <or>, %[[vred]] : vector<128xi32> into i32
// CHECK:         affine.store %[[final_red]], %{{.*}} : memref<256xi32>
// CHECK:       }

// -----

// The inner reduction loop '%j' is vectorized. (The order of addf's operands is
// different than in the first test case: the loaded value comes first and the
// accumulator second; the vectorized addf keeps that order.)

func.func @vecdim_reduction_comm(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %ld, %red_iter : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_comm
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[ld]], %[[red_iter]] : vector<128xf32>
// CHECK:           affine.yield %[[add]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[final_sum:.*]] = vector.reduction <add>, %[[vred]] : vector<128xf32> into f32
// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// The inner reduction loop '%j' is vectorized. Transforming the input (here
// with math.sin followed by math.exp) before performing the accumulation
// doesn't cause any problem.

func.func @vecdim_reduction_expsin(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %sin = math.sin %ld : f32
      %exp = math.exp %sin : f32
      %add = arith.addf %red_iter, %exp : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_expsin
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[sin:.*]] = math.sin %[[ld]]
// CHECK:           %[[exp:.*]] = math.exp %[[sin]]
// CHECK:           %[[add:.*]] = arith.addf %[[red_iter]], %[[exp]] : vector<128xf32>
// CHECK:           affine.yield %[[add]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[final_sum:.*]] = vector.reduction <add>, %[[vred]] : vector<128xf32> into f32
// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// Two reductions at the same time. The inner reduction loop '%j' is vectorized.
// Each horizontal reduction must consume the matching result of the vectorized
// loop (#0 for the sum, #1 for the product).

func.func @two_vecdim_reductions(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>) {
  %cst = arith.constant 1.000000e+00 : f32
  affine.for %i = 0 to 256 {
    // Note that we pass the same constant '1.0' as initial values for both
    // reductions.
    %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %part_sum, %ld : f32
      %mul = arith.mulf %part_prod, %ld : f32
      affine.yield %add, %mul : f32, f32
    }
    affine.store %sum, %out_sum[%i] : memref<256xf32>
    affine.store %prod, %out_prod[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @two_vecdim_reductions
// CHECK:       %[[cst:.*]] = arith.constant 1.000000e+00 : f32
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[vone:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32>
// CHECK:         %[[vred:.*]]:2 = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[part_sum:.*]] = %[[vzero]], %[[part_prod:.*]] = %[[vone]]) -> (vector<128xf32>, vector<128xf32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[part_sum]], %[[ld]] : vector<128xf32>
// CHECK:           %[[mul:.*]] = arith.mulf %[[part_prod]], %[[ld]] : vector<128xf32>
// CHECK:           affine.yield %[[add]], %[[mul]] : vector<128xf32>, vector<128xf32>
// CHECK:         }
// CHECK:         %[[nonfinal_sum:.*]] = vector.reduction <add>, %[[vred]]#0 : vector<128xf32> into f32
// Note that to compute the final sum we need to add the original initial value
// (%cst) since it is not zero.
// CHECK:         %[[final_sum:.*]] = arith.addf %[[nonfinal_sum]], %[[cst]] : f32
// For the final product we don't need to do this additional step because the
// initial value equals 1 (the neutral element for multiplication).
// CHECK:         %[[final_prod:.*]] = vector.reduction <mul>, %[[vred]]#1 : vector<128xf32> into f32
// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
// CHECK:         affine.store %[[final_prod]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// The integer case: a sum starting at 0 and a product starting at 1 are
// reduced at the same time. The inner loop '%j' is vectorized and each
// horizontal reduction consumes the matching loop result.

func.func @two_vecdim_reductions_int(%in: memref<256x512xi64>, %out_sum: memref<256xi64>, %out_prod: memref<256xi64>) {
  %cst0 = arith.constant 0 : i64
  %cst1 = arith.constant 1 : i64
  affine.for %i = 0 to 256 {
    %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst0, %part_prod = %cst1) -> (i64, i64) {
      %ld = affine.load %in[%i, %j] : memref<256x512xi64>
      %add = arith.addi %part_sum, %ld : i64
      %mul = arith.muli %part_prod, %ld : i64
      affine.yield %add, %mul : i64, i64
    }
    affine.store %sum, %out_sum[%i] : memref<256xi64>
    affine.store %prod, %out_prod[%i] : memref<256xi64>
  }
  return
}

// CHECK-LABEL: @two_vecdim_reductions_int
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0> : vector<128xi64>
// CHECK:         %[[vone:.*]] = arith.constant dense<1> : vector<128xi64>
// CHECK:         %[[vred:.*]]:2 = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[part_sum:.*]] = %[[vzero]], %[[part_prod:.*]] = %[[vone]]) -> (vector<128xi64>, vector<128xi64>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi64>, vector<128xi64>
// CHECK:           %[[add:.*]] = arith.addi %[[part_sum]], %[[ld]] : vector<128xi64>
// CHECK:           %[[mul:.*]] = arith.muli %[[part_prod]], %[[ld]] : vector<128xi64>
// CHECK:           affine.yield %[[add]], %[[mul]] : vector<128xi64>, vector<128xi64>
// CHECK:         }
// CHECK:         %[[final_sum:.*]] = vector.reduction <add>, %[[vred]]#0 : vector<128xi64> into i64
// CHECK:         %[[final_prod:.*]] = vector.reduction <mul>, %[[vred]]#1 : vector<128xi64> into i64
// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xi64>
// CHECK:         affine.store %[[final_prod]], %{{.*}} : memref<256xi64>
// CHECK:       }

// -----

// The outer reduction loop '%j' is vectorized. The inner loop '%i' keeps its
// bounds and unit step but accumulates 128-wide vectors; only the result of
// the outer loop is reduced to a scalar.

func.func @vecdim_reduction_nested(%in: memref<256x512xf32>, %out: memref<1xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  %outer_red = affine.for %j = 0 to 512 iter_args(%outer_iter = %cst) -> (f32) {
    %inner_red = affine.for %i = 0 to 256 iter_args(%inner_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %inner_iter, %ld : f32
      affine.yield %add : f32
    }
    %outer_add = arith.addf %outer_iter, %inner_red : f32
    affine.yield %outer_add : f32
  }
  affine.store %outer_red, %out[0] : memref<1xf32>
  return
}

// CHECK-LABEL: @vecdim_reduction_nested
// CHECK:       %[[vzero_outer:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:       %[[outer_red:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[outer_iter:.*]] = %[[vzero_outer]]) -> (vector<128xf32>) {
// CHECK:         %[[vzero_inner:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[inner_red:.*]] = affine.for %{{.*}} = 0 to 256 iter_args(%[[inner_iter:.*]] = %[[vzero_inner]]) -> (vector<128xf32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[inner_iter]], %[[ld]] : vector<128xf32>
// CHECK:           affine.yield %[[add]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[outer_add:.*]] = arith.addf %[[outer_iter]], %[[inner_red]] : vector<128xf32>
// CHECK:         affine.yield %[[outer_add]] : vector<128xf32>
// CHECK:       }
// CHECK:       %[[final_sum:.*]] = vector.reduction <add>, %[[outer_red]] : vector<128xf32> into f32
// CHECK:       affine.store %[[final_sum]], %{{.*}} : memref<1xf32>

// -----

// Rejected case: besides yielding it, the inner loop also stores each freshly
// computed partial sum to memory, so the loop is not vectorized.

func.func @vecdim_partial_sums_1_rejected(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>, %out_partsum: memref<256x512xf32>) {
  %one = arith.constant 1.000000e+00 : f32
  affine.for %row = 0 to 256 {
    %row_sum, %row_prod = affine.for %col = 0 to 512 iter_args(%acc_sum = %one, %acc_prod = %one) -> (f32, f32) {
      %elem = affine.load %in[%row, %col] : memref<256x512xf32>
      %next_sum = arith.addf %acc_sum, %elem : f32
      %next_prod = arith.mulf %acc_prod, %elem : f32
      // The updated partial sum escapes the loop through this store.
      affine.store %next_sum, %out_partsum[%row, %col] : memref<256x512xf32>
      affine.yield %next_sum, %next_prod : f32, f32
    }
    affine.store %row_sum, %out_sum[%row] : memref<256xf32>
    affine.store %row_prod, %out_prod[%row] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_partial_sums_1_rejected
// CHECK-NOT:   vector

// -----

// Rejected case: the inner loop stores the incoming partial sum (the iteration
// argument itself) to memory before updating it, so the loop is not
// vectorized.

func.func @vecdim_partial_sums_2_rejected(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>, %out_partsum: memref<256x512xf32>) {
  %one = arith.constant 1.000000e+00 : f32
  affine.for %row = 0 to 256 {
    %row_sum, %row_prod = affine.for %col = 0 to 512 iter_args(%acc_sum = %one, %acc_prod = %one) -> (f32, f32) {
      // The not-yet-updated partial sum escapes the loop through this store.
      affine.store %acc_sum, %out_partsum[%row, %col] : memref<256x512xf32>
      %elem = affine.load %in[%row, %col] : memref<256x512xf32>
      %next_sum = arith.addf %acc_sum, %elem : f32
      %next_prod = arith.mulf %acc_prod, %elem : f32
      affine.yield %next_sum, %next_prod : f32, f32
    }
    affine.store %row_sum, %out_sum[%row] : memref<256xf32>
    affine.store %row_prod, %out_prod[%row] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_partial_sums_2_rejected
// CHECK-NOT:   vector

// -----

// Rejected case: the loop adds its accumulator to itself, which is an unknown
// reduction operation, so the loop is not vectorized.

func.func @vecdim_unknown_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %one = arith.constant 1.000000e+00 : f32
  %result = affine.for %col = 0 to 512 iter_args(%acc = %one) -> (f32) {
    %doubled = arith.addf %acc, %acc : f32
    affine.yield %doubled : f32
  }
  affine.store %result, %out[0] : memref<256xf32>
  return
}

// CHECK-LABEL: @vecdim_unknown_reduction_rejected
// CHECK-NOT:   vector

// -----

// Rejected case: the loop performs no operation at all and simply yields its
// iteration argument. This is not recognized as a standard reduction, so the
// loop is not vectorized.

func.func @vecdim_none_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %one = arith.constant 1.000000e+00 : f32
  %result = affine.for %col = 0 to 512 iter_args(%acc = %one) -> (f32) {
    affine.yield %acc : f32
  }
  affine.store %result, %out[0] : memref<256xf32>
  return
}

// CHECK-LABEL: @vecdim_none_reduction_rejected
// CHECK-NOT:   vector

// -----

// The number of iterations (500) is not divisible by the vector size (128), so
// a mask has to be applied to the last update of the accumulator. Masked-off
// lanes are replaced by the neutral vector before the accumulation.

func.func @vecdim_reduction_masked(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 500 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK:       #[[$map0:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 500)>
// CHECK-LABEL: @vecdim_reduction_masked
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[vred:.*]] = affine.for %[[iv:.*]] = 0 to 500 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK:           %[[elems_left:.*]] = affine.apply #[[$map0]](%[[iv]])
// CHECK:           %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[select:.*]] = arith.select %[[mask]], %[[ld]], %[[vzero]] : vector<128xi1>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[red_iter]], %[[select]] : vector<128xf32>
// CHECK:           affine.yield %[[add]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[final_sum:.*]] = vector.reduction <add>, %[[vred]] : vector<128xf32> into f32
// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// The number of iterations is not known (the upper bound is the function
// argument '%bnd'), so a mask has to be applied.

func.func @vecdim_reduction_masked_unknown_ub(%in: memref<256x512xf32>, %out: memref<256xf32>, %bnd: index) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to %bnd iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK:       #[[$map1:.*]] = affine_map<([[d0:.*]]){{\[}}[[s0:.*]]{{\]}} -> (-[[d0]] + [[s0]])>
// CHECK-LABEL: @vecdim_reduction_masked_unknown_ub
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[vred:.*]] = affine.for %[[iv:.*]] = 0 to %[[bnd:.*]] step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK:           %[[elems_left:.*]] = affine.apply #[[$map1]](%[[iv]])[%[[bnd]]]
// CHECK:           %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[select:.*]] = arith.select %[[mask]], %[[ld]], %[[vzero]] : vector<128xi1>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[red_iter]], %[[select]] : vector<128xf32>
// CHECK:           affine.yield %[[add]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[final_sum:.*]] = vector.reduction <add>, %[[vred]] : vector<128xf32> into f32
// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// Nonzero lower bound: the loop runs from 127 to 511, i.e. 384 = 3 * 128
// iterations. The trip count is a multiple of the vector size, so no mask is
// expected.

func.func @vecdim_reduction_nonzero_lb(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %zero = arith.constant 0.000000e+00 : f32
  affine.for %row = 0 to 256 {
    %row_sum = affine.for %col = 127 to 511 iter_args(%acc = %zero) -> (f32) {
      %elem = affine.load %in[%row, %col] : memref<256x512xf32>
      %next = arith.addf %acc, %elem : f32
      affine.yield %next : f32
    }
    affine.store %row_sum, %out[%row] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_nonzero_lb
// CHECK:         %{{.*}} = affine.for %{{.*}} = 127 to 511 step 128 iter_args({{.*}}) -> (vector<128xf32>) {
// CHECK-NOT:     vector.create_mask

// -----

// Unknown lower bound (the function argument '%lb'): the trip count cannot be
// determined, so a mask is created from the number of elements left before the
// constant upper bound 512.

func.func @vecdim_reduction_masked_unknown_lb(%in: memref<256x512xf32>, %out: memref<256xf32>, %lb: index) {
  %zero = arith.constant 0.000000e+00 : f32
  affine.for %row = 0 to 256 {
    %row_sum = affine.for %col = %lb to 512 iter_args(%acc = %zero) -> (f32) {
      %elem = affine.load %in[%row, %col] : memref<256x512xf32>
      %next = arith.addf %acc, %elem : f32
      affine.yield %next : f32
    }
    affine.store %row_sum, %out[%row] : memref<256xf32>
  }
  return
}

// CHECK:       #[[$remaining512:.*]] = affine_map<([[dim:.*]]) -> (-[[dim]] + 512)>
// CHECK-LABEL: @vecdim_reduction_masked_unknown_lb
// CHECK:         %[[neutral:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %{{.*}} = affine.for %[[idx:.*]] = %[[lower:.*]] to 512 step 128 iter_args(%[[acc:.*]] = {{.*}}) -> (vector<128xf32>) {
// CHECK:           %[[remaining:.*]] = affine.apply #[[$remaining512]](%[[idx]])
// CHECK:           %[[active:.*]] = vector.create_mask %[[remaining]] : vector<128xi1>
// CHECK:           %[[chunk:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[masked:.*]] = arith.select %[[active]], %[[chunk]], %[[neutral]] : vector<128xi1>, vector<128xf32>
// CHECK:           %[[sum:.*]] = arith.addf %[[acc]], %[[masked]] : vector<128xf32>
// CHECK:           affine.yield %[[sum]] : vector<128xf32>

// -----

// The upper bound is a minimum expression, min(%M, 2 * %N). The mask is built
// from the difference between that minimum, recomputed inside the loop with
// affine.min, and the induction variable.

func.func @vecdim_reduction_complex_ub(%in: memref<256x512xf32>, %out: memref<256xf32>, %M: index, %N: index) {
  %zero = arith.constant 0.000000e+00 : f32
  affine.for %row = 0 to 256 {
    %row_sum = affine.for %col = 0 to min affine_map<(d0, d1) -> (d0, d1*2)>(%M, %N) iter_args(%acc = %zero) -> (f32) {
      %elem = affine.load %in[%row, %col] : memref<256x512xf32>
      %next = arith.addf %acc, %elem : f32
      affine.yield %next : f32
    }
    affine.store %row_sum, %out[%row] : memref<256xf32>
  }
  return
}

// CHECK:       #[[$ub_map:.*]] = affine_map<([[a:.*]], [[b:.*]]) -> ([[a]], [[b]] * 2)>
// CHECK:       #[[$diff_map:.*]] = affine_map<([[lhs:.*]], [[rhs:.*]]) -> ([[lhs]] - [[rhs]])>
// CHECK-LABEL: @vecdim_reduction_complex_ub
// CHECK:         %[[neutral:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %{{.*}} = affine.for %[[idx:.*]] = 0 to min #[[$ub_map]](%[[m:.*]], %[[n:.*]]) step 128 iter_args(%[[acc:.*]] = {{.*}}) -> (vector<128xf32>) {
// CHECK:           %[[upper:.*]] = affine.min #[[$ub_map]](%[[m]], %[[n]])
// CHECK:           %[[remaining:.*]] = affine.apply #[[$diff_map]](%[[upper]], %[[idx]])
// CHECK:           %[[active:.*]] = vector.create_mask %[[remaining]] : vector<128xi1>
// CHECK:           %[[chunk:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[masked:.*]] = arith.select %[[active]], %[[chunk]], %[[neutral]] : vector<128xi1>, vector<128xf32>
// CHECK:           %[[sum:.*]] = arith.addf %[[acc]], %[[masked]] : vector<128xf32>
// CHECK:           affine.yield %[[sum]] : vector<128xf32>

// -----

// Two reductions (a plain sum and a sum of exponentials) over a loop of 500
// iterations. One mask is created per iteration and applied to both
// accumulator updates.

func.func @vecdim_two_reductions_masked(%in: memref<256x512xf32>, %out: memref<512xf32>) {
  %zero = arith.constant 0.000000e+00 : f32
  affine.for %row = 0 to 256 {
    %row_sum, %row_expsum = affine.for %col = 0 to 500 iter_args(%acc_sum = %zero, %acc_expsum = %zero) -> (f32, f32) {
      %elem = affine.load %in[%row, %col] : memref<256x512xf32>
      %elem_exp = math.exp %elem : f32
      %next_sum = arith.addf %acc_sum, %elem : f32
      %next_expsum = arith.addf %acc_expsum, %elem_exp : f32
      affine.yield %next_sum, %next_expsum : f32, f32
    }
    // The two results of a row are interleaved in the output buffer.
    affine.store %row_sum, %out[2*%row] : memref<512xf32>
    affine.store %row_expsum, %out[2*%row + 1] : memref<512xf32>
  }
  return
}

// CHECK:       #[[$remaining500:.*]] = affine_map<([[dim:.*]]) -> (-[[dim]] + 500)>
// CHECK-LABEL: @vecdim_two_reductions_masked
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[neutral_sum:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[neutral_exp:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %{{.*}} = affine.for %[[idx:.*]] = 0 to 500 step 128 iter_args(%[[acc_sum:.*]] = {{.*}}, %[[acc_exp:.*]] = {{.*}}) -> (vector<128xf32>, vector<128xf32>) {
// CHECK:           %[[remaining:.*]] = affine.apply #[[$remaining500]](%[[idx]])
// CHECK:           %[[active:.*]] = vector.create_mask %[[remaining]] : vector<128xi1>
// CHECK:           %[[chunk:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[chunk_exp:.*]] = math.exp %[[chunk]] : vector<128xf32>
// CHECK:           %[[masked_sum:.*]] = arith.select %[[active]], %[[chunk]], %[[neutral_sum]] : vector<128xi1>, vector<128xf32>
// CHECK:           %[[next_sum:.*]] = arith.addf %[[acc_sum]], %[[masked_sum]] : vector<128xf32>
// CHECK:           %[[masked_exp:.*]] = arith.select %[[active]], %[[chunk_exp]], %[[neutral_exp]] : vector<128xi1>, vector<128xf32>
// CHECK:           %[[next_exp:.*]] = arith.addf %[[acc_exp]], %[[masked_exp]] : vector<128xf32>
// CHECK:           affine.yield %[[next_sum]], %[[next_exp]] : vector<128xf32>
// CHECK:         }
