// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0 vectorize-reductions=true" -split-input-file | FileCheck %s

// Tests for the affine super-vectorizer applied to reduction loops, i.e.
// affine.for loops that carry an accumulator through iter_args. Each test case
// is separated by a split marker and checked independently.

// The inner reduction loop '%j' is vectorized.

func.func @vecdim_reduction(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[red_iter]], %[[ld]] : vector<128xf32>
// CHECK:           affine.yield %[[add]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[final_sum:.*]] = vector.reduction <add>, %[[vred]] : vector<128xf32> into f32
// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// Floating-point minimum reduction. The accumulator starts at +inf
// (0x7F800000), and the vectorized loop is expected to start from a splat of
// the same value.

func.func @vecdim_reduction_minf(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0x7F800000 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %min = arith.minf %red_iter, %ld : f32
      affine.yield %min : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_minf
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vmax:.*]] = arith.constant dense<0x7F800000> : vector<128xf32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmax]]) -> (vector<128xf32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[min:.*]] = arith.minf %[[red_iter]], %[[ld]] : vector<128xf32>
// CHECK:           affine.yield %[[min]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[final_min:.*]] = vector.reduction <minf>, %[[vred]] : vector<128xf32> into f32
// CHECK:         affine.store %[[final_min]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// Floating-point maximum reduction. The accumulator starts at -inf
// (0xFF800000).

func.func @vecdim_reduction_maxf(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0xFF800000 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %max = arith.maxf %red_iter, %ld : f32
      affine.yield %max : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_maxf
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vmin:.*]] = arith.constant dense<0xFF800000> : vector<128xf32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmin]]) -> (vector<128xf32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[max:.*]] = arith.maxf %[[red_iter]], %[[ld]] : vector<128xf32>
// CHECK:           affine.yield %[[max]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[final_max:.*]] = vector.reduction <maxf>, %[[vred]] : vector<128xf32> into f32
// CHECK:         affine.store %[[final_max]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// Signed integer minimum reduction. The accumulator starts at the largest
// signed 32-bit value (2147483647).

func.func @vecdim_reduction_minsi(%in: memref<256x512xi32>, %out: memref<256xi32>) {
  %cst = arith.constant 2147483647 : i32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xi32>
      %min = arith.minsi %red_iter, %ld : i32
      affine.yield %min : i32
    }
    affine.store %final_red, %out[%i] : memref<256xi32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_minsi
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vmax:.*]] = arith.constant dense<2147483647> : vector<128xi32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmax]]) -> (vector<128xi32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32>
// CHECK:           %[[min:.*]] = arith.minsi %[[red_iter]], %[[ld]] : vector<128xi32>
// CHECK:           affine.yield %[[min]] : vector<128xi32>
// CHECK:         }
// CHECK:         %[[final_min:.*]] = vector.reduction <minsi>, %[[vred]] : vector<128xi32> into i32
// CHECK:         affine.store %[[final_min]], %{{.*}} : memref<256xi32>
// CHECK:       }

// -----

// Signed integer maximum reduction. The accumulator starts at the smallest
// signed 32-bit value (-2147483648).

func.func @vecdim_reduction_maxsi(%in: memref<256x512xi32>, %out: memref<256xi32>) {
  %cst = arith.constant -2147483648 : i32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xi32>
      %max = arith.maxsi %red_iter, %ld : i32
      affine.yield %max : i32
    }
    affine.store %final_red, %out[%i] : memref<256xi32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_maxsi
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vmin:.*]] = arith.constant dense<-2147483648> : vector<128xi32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmin]]) -> (vector<128xi32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32>
// CHECK:           %[[max:.*]] = arith.maxsi %[[red_iter]], %[[ld]] : vector<128xi32>
// CHECK:           affine.yield %[[max]] : vector<128xi32>
// CHECK:         }
// CHECK:         %[[final_max:.*]] = vector.reduction <maxsi>, %[[vred]] : vector<128xi32> into i32
// CHECK:         affine.store %[[final_max]], %{{.*}} : memref<256xi32>
// CHECK:       }

// -----

// Unsigned integer minimum reduction. The accumulator starts at -1, i.e. all
// bits set, which is the largest unsigned 32-bit value.

func.func @vecdim_reduction_minui(%in: memref<256x512xi32>, %out: memref<256xi32>) {
  %cst = arith.constant -1 : i32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xi32>
      %min = arith.minui %red_iter, %ld : i32
      affine.yield %min : i32
    }
    affine.store %final_red, %out[%i] : memref<256xi32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_minui
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vmax:.*]] = arith.constant dense<-1> : vector<128xi32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmax]]) -> (vector<128xi32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32>
// CHECK:           %[[min:.*]] = arith.minui %[[red_iter]], %[[ld]] : vector<128xi32>
// CHECK:           affine.yield %[[min]] : vector<128xi32>
// CHECK:         }
// CHECK:         %[[final_min:.*]] = vector.reduction <minui>, %[[vred]] : vector<128xi32> into i32
// CHECK:         affine.store %[[final_min]], %{{.*}} : memref<256xi32>
// CHECK:       }

// -----

// Unsigned integer maximum reduction. The accumulator starts at 0, the
// smallest unsigned value.

func.func @vecdim_reduction_maxui(%in: memref<256x512xi32>, %out: memref<256xi32>) {
  %cst = arith.constant 0 : i32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xi32>
      %max = arith.maxui %red_iter, %ld : i32
      affine.yield %max : i32
    }
    affine.store %final_red, %out[%i] : memref<256xi32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_maxui
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vmin:.*]] = arith.constant dense<0> : vector<128xi32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmin]]) -> (vector<128xi32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32>
// CHECK:           %[[max:.*]] = arith.maxui %[[red_iter]], %[[ld]] : vector<128xi32>
// CHECK:           affine.yield %[[max]] : vector<128xi32>
// CHECK:         }
// CHECK:         %[[final_max:.*]] = vector.reduction <maxui>, %[[vred]] : vector<128xi32> into i32
// CHECK:         affine.store %[[final_max]], %{{.*}} : memref<256xi32>
// CHECK:       }

// -----

// Bitwise AND reduction. The accumulator starts at -1 (all bits set).

func.func @vecdim_reduction_andi(%in: memref<256x512xi32>, %out: memref<256xi32>) {
  %cst = arith.constant -1 : i32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xi32>
      %and = arith.andi %red_iter, %ld : i32
      affine.yield %and : i32
    }
    affine.store %final_red, %out[%i] : memref<256xi32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_andi
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vallone:.*]] = arith.constant dense<-1> : vector<128xi32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vallone]]) -> (vector<128xi32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32>
// CHECK:           %[[and:.*]] = arith.andi %[[red_iter]], %[[ld]] : vector<128xi32>
// CHECK:           affine.yield %[[and]] : vector<128xi32>
// CHECK:         }
// CHECK:         %[[final_red:.*]] = vector.reduction <and>, %[[vred]] : vector<128xi32> into i32
// CHECK:         affine.store %[[final_red]], %{{.*}} : memref<256xi32>
// CHECK:       }

// -----

// Bitwise OR reduction. The accumulator starts at 0.

func.func @vecdim_reduction_ori(%in: memref<256x512xi32>, %out: memref<256xi32>) {
  %cst = arith.constant 0 : i32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xi32>
      %or = arith.ori %red_iter, %ld : i32
      affine.yield %or : i32
    }
    affine.store %final_red, %out[%i] : memref<256xi32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_ori
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0> : vector<128xi32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xi32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi32>, vector<128xi32>
// CHECK:           %[[or:.*]] = arith.ori %[[red_iter]], %[[ld]] : vector<128xi32>
// CHECK:           affine.yield %[[or]] : vector<128xi32>
// CHECK:         }
// CHECK:         %[[final_red:.*]] = vector.reduction <or>, %[[vred]] : vector<128xi32> into i32
// CHECK:         affine.store %[[final_red]], %{{.*}} : memref<256xi32>
// CHECK:       }

// -----

// The inner reduction loop '%j' is vectorized. (The order of addf's operands is
// different than in the previous test case).

func.func @vecdim_reduction_comm(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %ld, %red_iter : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_comm
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[ld]], %[[red_iter]] : vector<128xf32>
// CHECK:           affine.yield %[[add]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[final_sum:.*]] = vector.reduction <add>, %[[vred]] : vector<128xf32> into f32
// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// The inner reduction loop '%j' is vectorized. Transforming the input before
// performing the accumulation doesn't cause any problem.

func.func @vecdim_reduction_expsin(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %sin = math.sin %ld : f32
      %exp = math.exp %sin : f32
      %add = arith.addf %red_iter, %exp : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_expsin
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[sin:.*]] = math.sin %[[ld]]
// CHECK:           %[[exp:.*]] = math.exp %[[sin]]
// CHECK:           %[[add:.*]] = arith.addf %[[red_iter]], %[[exp]] : vector<128xf32>
// CHECK:           affine.yield %[[add]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[final_sum:.*]] = vector.reduction <add>, %[[vred]] : vector<128xf32> into f32
// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// Two reductions at the same time. The inner reduction loop '%j' is vectorized.

func.func @two_vecdim_reductions(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>) {
  %cst = arith.constant 1.000000e+00 : f32
  affine.for %i = 0 to 256 {
    // Note that we pass the same constant '1.0' as initial values for both
    // reductions.
    %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %part_sum, %ld : f32
      %mul = arith.mulf %part_prod, %ld : f32
      affine.yield %add, %mul : f32, f32
    }
    affine.store %sum, %out_sum[%i] : memref<256xf32>
    affine.store %prod, %out_prod[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @two_vecdim_reductions
// CHECK:       %[[cst:.*]] = arith.constant 1.000000e+00 : f32
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[vone:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32>
// CHECK:         %[[vred:.*]]:2 = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[part_sum:.*]] = %[[vzero]], %[[part_prod:.*]] = %[[vone]]) -> (vector<128xf32>, vector<128xf32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[part_sum]], %[[ld]] : vector<128xf32>
// CHECK:           %[[mul:.*]] = arith.mulf %[[part_prod]], %[[ld]] : vector<128xf32>
// CHECK:           affine.yield %[[add]], %[[mul]] : vector<128xf32>, vector<128xf32>
// CHECK:         }
// CHECK:         %[[nonfinal_sum:.*]] = vector.reduction <add>, %[[vred]]#0 : vector<128xf32> into f32
// Note that to compute the final sum we need to add the original initial value
// (%cst) since it is not zero.
// CHECK:         %[[final_sum:.*]] = arith.addf %[[nonfinal_sum]], %[[cst]] : f32
// For the final product we don't need to do this additional step because the
// initial value equals 1 (the neutral element for multiplication).
// CHECK:         %[[final_prod:.*]] = vector.reduction <mul>, %[[vred]]#1 : vector<128xf32> into f32
// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
// CHECK:         affine.store %[[final_prod]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// The integer case.

func.func @two_vecdim_reductions_int(%in: memref<256x512xi64>, %out_sum: memref<256xi64>, %out_prod: memref<256xi64>) {
  %cst0 = arith.constant 0 : i64
  %cst1 = arith.constant 1 : i64
  affine.for %i = 0 to 256 {
    %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst0, %part_prod = %cst1) -> (i64, i64) {
      %ld = affine.load %in[%i, %j] : memref<256x512xi64>
      %add = arith.addi %part_sum, %ld : i64
      %mul = arith.muli %part_prod, %ld : i64
      affine.yield %add, %mul : i64, i64
    }
    affine.store %sum, %out_sum[%i] : memref<256xi64>
    affine.store %prod, %out_prod[%i] : memref<256xi64>
  }
  return
}

// CHECK-LABEL: @two_vecdim_reductions_int
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0> : vector<128xi64>
// CHECK:         %[[vone:.*]] = arith.constant dense<1> : vector<128xi64>
// CHECK:         %[[vred:.*]]:2 = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[part_sum:.*]] = %[[vzero]], %[[part_prod:.*]] = %[[vone]]) -> (vector<128xi64>, vector<128xi64>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi64>, vector<128xi64>
// CHECK:           %[[add:.*]] = arith.addi %[[part_sum]], %[[ld]] : vector<128xi64>
// CHECK:           %[[mul:.*]] = arith.muli %[[part_prod]], %[[ld]] : vector<128xi64>
// CHECK:           affine.yield %[[add]], %[[mul]] : vector<128xi64>, vector<128xi64>
// CHECK:         }
// CHECK:         %[[final_sum:.*]] = vector.reduction <add>, %[[vred]]#0 : vector<128xi64> into i64
// CHECK:         %[[final_prod:.*]] = vector.reduction <mul>, %[[vred]]#1 : vector<128xi64> into i64
// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xi64>
// CHECK:         affine.store %[[final_prod]], %{{.*}} : memref<256xi64>
// CHECK:       }

// -----

// The outer reduction loop '%j' is vectorized.

func.func @vecdim_reduction_nested(%in: memref<256x512xf32>, %out: memref<1xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  %outer_red = affine.for %j = 0 to 512 iter_args(%outer_iter = %cst) -> (f32) {
    %inner_red = affine.for %i = 0 to 256 iter_args(%inner_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %inner_iter, %ld : f32
      affine.yield %add : f32
    }
    %outer_add = arith.addf %outer_iter, %inner_red : f32
    affine.yield %outer_add : f32
  }
  affine.store %outer_red, %out[0] : memref<1xf32>
  return
}

// CHECK-LABEL: @vecdim_reduction_nested
// CHECK:       %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:       %[[outer_red:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[outer_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[inner_red:.*]] = affine.for %{{.*}} = 0 to 256 iter_args(%[[inner_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[inner_iter]], %[[ld]] : vector<128xf32>
// CHECK:           affine.yield %[[add]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[outer_add:.*]] = arith.addf %[[outer_iter]], %[[inner_red]] : vector<128xf32>
// CHECK:         affine.yield %[[outer_add]] : vector<128xf32>
// CHECK:       }
// CHECK:       %[[final_sum:.*]] = vector.reduction <add>, %[[outer_red]] : vector<128xf32> into f32
// CHECK:       affine.store %[[final_sum]], %{{.*}} : memref<1xf32>

// -----

// The inner reduction loop '%j' computes partial sums as a side effect and
// is not vectorized.

func.func @vecdim_partial_sums_1_rejected(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>, %out_partsum: memref<256x512xf32>) {
  %cst = arith.constant 1.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %part_sum, %ld : f32
      %mul = arith.mulf %part_prod, %ld : f32
      affine.store %add, %out_partsum[%i, %j] : memref<256x512xf32>
      affine.yield %add, %mul : f32, f32
    }
    affine.store %sum, %out_sum[%i] : memref<256xf32>
    affine.store %prod, %out_prod[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_partial_sums_1_rejected
// CHECK-NOT:   vector

// -----

// The inner reduction loop '%j' computes partial sums as a side effect and
// is not vectorized.

func.func @vecdim_partial_sums_2_rejected(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>, %out_partsum: memref<256x512xf32>) {
  %cst = arith.constant 1.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
      affine.store %part_sum, %out_partsum[%i, %j] : memref<256x512xf32>
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %part_sum, %ld : f32
      %mul = arith.mulf %part_prod, %ld : f32
      affine.yield %add, %mul : f32, f32
    }
    affine.store %sum, %out_sum[%i] : memref<256xf32>
    affine.store %prod, %out_prod[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_partial_sums_2_rejected
// CHECK-NOT:   vector

// -----

// The inner reduction loop '%j' performs an unknown reduction operation and is
// not vectorized.

func.func @vecdim_unknown_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 1.000000e+00 : f32
  %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
    %add = arith.addf %red_iter, %red_iter : f32
    affine.yield %add : f32
  }
  affine.store %final_red, %out[0] : memref<256xf32>
  return
}

// CHECK-LABEL: @vecdim_unknown_reduction_rejected
// CHECK-NOT:   vector

// -----

// The reduction loop '%j' yields its accumulator unchanged, without performing
// any operation on it. This is not recognized as a standard reduction, so the
// loop is not vectorized.

func.func @vecdim_none_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 1.000000e+00 : f32
  %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
    affine.yield %red_iter : f32
  }
  affine.store %final_red, %out[0] : memref<256xf32>
  return
}

// CHECK-LABEL: @vecdim_none_reduction_rejected
// CHECK-NOT:   vector

// -----

// The number of iterations is not divisible by the vector size, so a mask has
// to be applied to the last update of the accumulator.

func.func @vecdim_reduction_masked(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 500 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK:       #[[$map0:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 500)>
// CHECK-LABEL: @vecdim_reduction_masked
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[vred:.*]] = affine.for %[[iv:.*]] = 0 to 500 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK:           %[[elems_left:.*]] = affine.apply #[[$map0]](%[[iv]])
// CHECK:           %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[select:.*]] = arith.select %[[mask]], %[[ld]], %[[vzero]] : vector<128xi1>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[red_iter]], %[[select]] : vector<128xf32>
// CHECK:           affine.yield %[[add]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[final_sum:.*]] = vector.reduction <add>, %[[vred]] : vector<128xf32> into f32
// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// The number of iterations is not known, so a mask has to be applied.

func.func @vecdim_reduction_masked_unknown_ub(%in: memref<256x512xf32>, %out: memref<256xf32>, %bnd: index) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to %bnd iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK:       #[[$map1:.*]] = affine_map<([[d0:.*]]){{\[}}[[s0:.*]]{{\]}} -> (-[[d0]] + [[s0]])>
// CHECK-LABEL: @vecdim_reduction_masked_unknown_ub
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[vred:.*]] = affine.for %[[iv:.*]] = 0 to %[[bnd:.*]] step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK:           %[[elems_left:.*]] = affine.apply #[[$map1]](%[[iv]])[%[[bnd]]]
// CHECK:           %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[select:.*]] = arith.select %[[mask]], %[[ld]], %[[vzero]] : vector<128xi1>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[red_iter]], %[[select]] : vector<128xf32>
// CHECK:           affine.yield %[[add]] : vector<128xf32>
// CHECK:         }
// CHECK:         %[[final_sum:.*]] = vector.reduction <add>, %[[vred]] : vector<128xf32> into f32
// CHECK:         affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
// CHECK:       }

// -----

// The lower bound is nonzero, but the number of iterations is divisible by the
// vector size, so masking is not needed.

func.func @vecdim_reduction_nonzero_lb(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 127 to 511 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vecdim_reduction_nonzero_lb
// CHECK:         %{{.*}} = affine.for %{{.*}} = 127 to 511 step 128 iter_args({{.*}}) -> (vector<128xf32>) {
// CHECK-NOT:     vector.create_mask

// -----

// The lower bound is unknown, so we need to create a mask.

func.func @vecdim_reduction_masked_unknown_lb(%in: memref<256x512xf32>, %out: memref<256xf32>, %lb: index) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = %lb to 512 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK:       #[[$map2:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 512)>
// CHECK-LABEL: @vecdim_reduction_masked_unknown_lb
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %{{.*}} = affine.for %[[iv:.*]] = %[[lb:.*]] to 512 step 128 iter_args(%[[red_iter:.*]] = {{.*}}) -> (vector<128xf32>) {
// CHECK:           %[[elems_left:.*]] = affine.apply #[[$map2]](%[[iv]])
// CHECK:           %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[select:.*]] = arith.select %[[mask]], %[[ld]], %[[vzero]] : vector<128xi1>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[red_iter]], %[[select]] : vector<128xf32>
// CHECK:           affine.yield %[[add]] : vector<128xf32>

// -----

// The upper bound is a minimum expression.

func.func @vecdim_reduction_complex_ub(%in: memref<256x512xf32>, %out: memref<256xf32>, %M: index, %N: index) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to min affine_map<(d0, d1) -> (d0, d1*2)>(%M, %N) iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK:       #[[$map3:.*]] = affine_map<([[d0:.*]], [[d1:.*]]) -> ([[d0]], [[d1]] * 2)>
// CHECK:       #[[$map3_sub:.*]] = affine_map<([[d0:.*]], [[d1:.*]]) -> ([[d0]] - [[d1]])>
// CHECK-LABEL: @vecdim_reduction_complex_ub
// CHECK:         %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %{{.*}} = affine.for %[[iv:.*]] = 0 to min #[[$map3]](%[[M:.*]], %[[N:.*]]) step 128 iter_args(%[[red_iter:.*]] = {{.*}}) -> (vector<128xf32>) {
// CHECK:           %[[ub:.*]] = affine.min #[[$map3]](%[[M]], %[[N]])
// CHECK:           %[[elems_left:.*]] = affine.apply #[[$map3_sub]](%[[ub]], %[[iv]])
// CHECK:           %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[select:.*]] = arith.select %[[mask]], %[[ld]], %[[vzero]] : vector<128xi1>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[red_iter]], %[[select]] : vector<128xf32>
// CHECK:           affine.yield %[[add]] : vector<128xf32>

// -----

// The same mask is applied to both reductions.

func.func @vecdim_two_reductions_masked(%in: memref<256x512xf32>, %out: memref<512xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_sum, %final_expsum = affine.for %j = 0 to 500 iter_args(%sum_iter = %cst, %expsum_iter = %cst) -> (f32, f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %exp = math.exp %ld : f32
      %add = arith.addf %sum_iter, %ld : f32
      %eadd = arith.addf %expsum_iter, %exp : f32
      affine.yield %add, %eadd : f32, f32
    }
    affine.store %final_sum, %out[2*%i] : memref<512xf32>
    affine.store %final_expsum, %out[2*%i + 1] : memref<512xf32>
  }
  return
}

// CHECK:       #[[$map4:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 500)>
// CHECK-LABEL: @vecdim_two_reductions_masked
// CHECK:       affine.for %{{.*}} = 0 to 256 {
// CHECK:         %[[vzero0:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %[[vzero1:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK:         %{{.*}} = affine.for %[[iv:.*]] = 0 to 500 step 128 iter_args(%[[sum_iter:.*]] = {{.*}}, %[[esum_iter:.*]] = {{.*}}) -> (vector<128xf32>, vector<128xf32>) {
// CHECK:           %[[elems_left:.*]] = affine.apply #[[$map4]](%[[iv]])
// CHECK:           %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
// CHECK:           %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK:           %[[exp:.*]] = math.exp %[[ld]] : vector<128xf32>
// CHECK:           %[[select0:.*]] = arith.select %[[mask]], %[[ld]], %[[vzero0]] : vector<128xi1>, vector<128xf32>
// CHECK:           %[[add:.*]] = arith.addf %[[sum_iter]], %[[select0]] : vector<128xf32>
// CHECK:           %[[select1:.*]] = arith.select %[[mask]], %[[exp]], %[[vzero1]] : vector<128xi1>, vector<128xf32>
// CHECK:           %[[eadd:.*]] = arith.addf %[[esum_iter]], %[[select1]] : vector<128xf32>
// CHECK:           affine.yield %[[add]], %[[eadd]] : vector<128xf32>, vector<128xf32>
// CHECK:         }