// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0" -split-input-file | FileCheck %s

// 1-D super-vectorization tests. The pass is invoked with a virtual vector
// size of 128 and test-fastest-varying=0. Tests are separated by split markers
// so that each one is processed independently.

// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)>
// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)>

// The load uses only constant subscripts, so it does not depend on %i0.
// Expected: the loop is stepped by 128 and the load becomes a transfer_read
// carrying the (d0, d1) -> (0) permutation map.
// CHECK-LABEL: func @vec1d_1
func.func @vec1d_1(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for {{.*}} step 128
// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]])
// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]])
// CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
  affine.for %i0 = 0 to %M { // vectorized due to scalar -> vector
    %a0 = affine.load %A[%c0, %c0] : memref<?x?xf32>
  }
  return
}

// -----

// %i3 is the last subscript of the load. Expected: the loop is stepped by 128
// and the load becomes a transfer_read with no explicit permutation map.
// CHECK-LABEL: func @vec1d_2
func.func @vec1d_2(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK:for [[IV3:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %[[CST]] : memref<?x?xf32>, vector<128xf32>
  affine.for %i3 = 0 to %M { // vectorized
    %a3 = affine.load %A[%c0, %i3] : memref<?x?xf32>
  }
  return
}

// -----

// %i8 appears only in the last subscript (%i8 + %i9). Expected: the outer loop
// is stepped by 128, the inner loop keeps its unit step, and the transfer_read
// is indexed by the results of two affine.apply ops.
// CHECK-LABEL: func @vec1d_3
func.func @vec1d_3(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %arg0, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %arg0, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %arg1, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK:for [[IV8:%[arg0-9]+]] = 0 to [[ARG_M]] step 128
// CHECK-NEXT: for [[IV9:%[arg0-9]*]] = 0 to [[ARG_N]] {
// CHECK-NEXT: %[[APP9_0:[0-9]+]] = affine.apply {{.*}}([[IV9]], [[IV8]])
// CHECK-NEXT: %[[APP9_1:[0-9]+]] = affine.apply {{.*}}([[IV9]], [[IV8]])
// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%[[APP9_0]], %[[APP9_1]]], %[[CST]] : memref<?x?xf32>, vector<128xf32>
  affine.for %i8 = 0 to %M { // vectorized
    affine.for %i9 = 0 to %N {
      %a9 = affine.load %A[%i9, %i8 + %i9] : memref<?x?xf32>
    }
  }
  return
}

// -----

// Three 2-D loop nests: two that store a scalar constant and one elementwise
// add with a diamond-shaped use of the intermediate sum. Expected: the scalar
// constants appear as dense vector<128xf32> constants and every load/store
// becomes a vector transfer op.
// CHECK-LABEL: func @vector_add_2d
func.func @vector_add_2d(%M : index, %N : index) -> f32 {
  %A = memref.alloc (%M, %N) : memref<?x?xf32, 0>
  %B = memref.alloc (%M, %N) : memref<?x?xf32, 0>
  %C = memref.alloc (%M, %N) : memref<?x?xf32, 0>
  %f1 = arith.constant 1.0 : f32
  %f2 = arith.constant 2.0 : f32
  affine.for %i0 = 0 to %M {
    affine.for %i1 = 0 to %N {
      // CHECK: %[[C1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32>
      // CHECK: vector.transfer_write %[[C1]], {{.*}} : vector<128xf32>, memref<?x?xf32>
      // non-scoped %f1
      affine.store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
    }
  }
  affine.for %i2 = 0 to %M {
    affine.for %i3 = 0 to %N {
      // CHECK: %[[C3:.*]] = arith.constant dense<2.000000e+00> : vector<128xf32>
      // CHECK: vector.transfer_write %[[C3]], {{.*}} : vector<128xf32>, memref<?x?xf32>
      // non-scoped %f2
      affine.store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
    }
  }
  affine.for %i4 = 0 to %M {
    affine.for %i5 = 0 to %N {
      // CHECK: %[[SPLAT2:.*]] = arith.constant dense<2.000000e+00> : vector<128xf32>
      // CHECK: %[[SPLAT1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32>
      // CHECK: %[[A5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} : memref<?x?xf32>, vector<128xf32>
      // CHECK: %[[B5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} : memref<?x?xf32>, vector<128xf32>
      // CHECK: %[[S5:.*]] = arith.addf %[[A5]], %[[B5]] : vector<128xf32>
      // CHECK: %[[S6:.*]] = arith.addf %[[S5]], %[[SPLAT1]] : vector<128xf32>
      // CHECK: %[[S7:.*]] = arith.addf %[[S5]], %[[SPLAT2]] : vector<128xf32>
      // CHECK: %[[S8:.*]] = arith.addf %[[S7]], %[[S6]] : vector<128xf32>
      // CHECK: vector.transfer_write %[[S8]], {{.*}} : vector<128xf32>, memref<?x?xf32>
      %a5 = affine.load %A[%i4, %i5] : memref<?x?xf32, 0>
      %b5 = affine.load %B[%i4, %i5] : memref<?x?xf32, 0>
      %s5 = arith.addf %a5, %b5 : f32
      // non-scoped %f1
      %s6 = arith.addf %s5, %f1 : f32
      // non-scoped %f2
      %s7 = arith.addf %s5, %f2 : f32
      // diamond dependency.
      %s8 = arith.addf %s7, %s6 : f32
      affine.store %s8, %C[%i4, %i5] : memref<?x?xf32, 0>
    }
  }
  %c7 = arith.constant 7 : index
  %c42 = arith.constant 42 : index
  %res = affine.load %C[%c7, %c42] : memref<?x?xf32, 0>
  return %res : f32
}

// -----

// The scalar constant %f1 is stored both inside the inner loop and directly in
// the outer loop body. Expected: one dense vector constant feeds both
// transfer_write ops.
// CHECK-LABEL: func @vec_constant_with_two_users
func.func @vec_constant_with_two_users(%M : index, %N : index) -> (f32, f32) {
  %A = memref.alloc (%M, %N) : memref<?x?xf32, 0>
  %B = memref.alloc (%M) : memref<?xf32, 0>
  %f1 = arith.constant 1.0 : f32
  affine.for %i0 = 0 to %M { // vectorized
    // CHECK: %[[C1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32>
    // CHECK-NEXT: affine.for
    // CHECK-NEXT: vector.transfer_write %[[C1]], {{.*}} : vector<128xf32>, memref<?x?xf32>
    affine.for %i1 = 0 to %N {
      affine.store %f1, %A[%i1, %i0] : memref<?x?xf32, 0>
    }
    // CHECK: vector.transfer_write %[[C1]], {{.*}} : vector<128xf32>, memref<?xf32>
    affine.store %f1, %B[%i0] : memref<?xf32, 0>
  }
  %c12 = arith.constant 12 : index
  %res1 = affine.load %A[%c12, %c12] : memref<?x?xf32, 0>
  %res2 = affine.load %B[%c12] : memref<?xf32, 0>
  return %res1, %res2 : f32, f32
}

// -----

// The stored value is derived from the inner induction variable %j while the
// outer loop %i is the one being vectorized. Expected: %j is broadcast to a
// vector<128xindex> and the index_cast operates on that vector.
// CHECK-LABEL: func @vec_block_arg
func.func @vec_block_arg(%A : memref<32x512xi32>) {
  // CHECK: affine.for %[[IV0:[arg0-9]+]] = 0 to 512 step 128 {
  // CHECK-NEXT: affine.for %[[IV1:[arg0-9]+]] = 0 to 32 {
  // CHECK-NEXT: %[[BROADCAST:.*]] = vector.broadcast %[[IV1]] : index to vector<128xindex>
  // CHECK-NEXT: %[[CAST:.*]] = arith.index_cast %[[BROADCAST]] : vector<128xindex> to vector<128xi32>
  // CHECK-NEXT: vector.transfer_write %[[CAST]], {{.*}}[%[[IV1]], %[[IV0]]] : vector<128xi32>, memref<32x512xi32>
  affine.for %i = 0 to 512 { // vectorized
    affine.for %j = 0 to 32 {
      %idx = arith.index_cast %j : index to i32
      affine.store %idx, %A[%j, %i] : memref<32x512xi32>
    }
  }
  return
}

// -----

// Only the middle loop %i1 is vectorized. Expected: the induction variables of
// the surrounding (%i0) and nested (%i2) loops are broadcast, while no
// broadcast of the vectorized loop's own induction variable is emitted.
// CHECK-DAG: #[[$map0:map[0-9]+]] = affine_map<(d0, d1, d2) -> (d0 * 2 + d1 - 1)>
// CHECK-DAG: #[[$map1:map[0-9]+]] = affine_map<(d0, d1, d2) -> (d2)>
// CHECK-LABEL: func @vec_block_arg_2
func.func @vec_block_arg_2(%A : memref<?x512xindex>) {
  %c0 = arith.constant 0 : index
  %N = memref.dim %A, %c0 : memref<?x512xindex>
  // CHECK: affine.for %[[IV0:[arg0-9]+]] = 0 to %{{.*}} {
  // CHECK-NEXT: %[[BROADCAST1:.*]] = vector.broadcast %[[IV0]] : index to vector<128xindex>
  // CHECK-NEXT: affine.for %[[IV1:[arg0-9]+]] = 0 to 512 step 128 {
  // CHECK-NOT: vector.broadcast %[[IV1]]
  // CHECK: affine.for %[[IV2:[arg0-9]+]] = 0 to 2 {
  // CHECK-NEXT: %[[BROADCAST2:.*]] = vector.broadcast %[[IV2]] : index to vector<128xindex>
  // CHECK-NEXT: %[[INDEX1:.*]] = affine.apply #[[$map0]](%[[IV0]], %[[IV2]], %[[IV1]])
  // CHECK-NEXT: %[[INDEX2:.*]] = affine.apply #[[$map1]](%[[IV0]], %[[IV2]], %[[IV1]])
  // CHECK: %[[LOAD:.*]] = vector.transfer_read %{{.*}}[%[[INDEX1]], %[[INDEX2]]], %{{.*}} : memref<?x512xindex>, vector<128xindex>
  // CHECK-NEXT: arith.muli %[[BROADCAST1]], %[[LOAD]] : vector<128xindex>
  // CHECK-NEXT: arith.addi %{{.*}}, %[[BROADCAST2]] : vector<128xindex>
  // CHECK: }
  affine.for %i0 = 0 to %N {
    affine.for %i1 = 0 to 512 { // vectorized
      affine.for %i2 = 0 to 2 {
        %0 = affine.load %A[%i0 * 2 + %i2 - 1, %i1] : memref<?x512xindex>
        %mul = arith.muli %i0, %0 : index
        %add = arith.addi %mul, %i2 : index
      }
    }
  }
  return
}

// -----

// %i1 is used in both subscripts of the load. Expected: the loop keeps its
// original form (no step is added).
// CHECK-LABEL: func @vec_rejected_1
func.func @vec_rejected_1(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK:for {{.*}} [[ARG_M]] {
  affine.for %i1 = 0 to %M { // not vectorized
    %a1 = affine.load %A[%i1, %i1] : memref<?x?xf32>
  }
  return
}

// -----

// %i2 is used only as the first subscript of the load. Expected: the loop
// keeps its original form under test-fastest-varying=0.
// CHECK-LABEL: func @vec_rejected_2
func.func @vec_rejected_2(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to [[ARG_M]] {
  affine.for %i2 = 0 to %M { // not vectorized, would vectorize with --test-fastest-varying=1
    %a2 = affine.load %A[%i2, %c0] : memref<?x?xf32>
  }
  return
}

// -----

// The outer loop %i4 is the last subscript of the load and the inner loop %i5
// is the first. Expected: only the outer loop is stepped by 128.
// CHECK-LABEL: func @vec_rejected_3
func.func @vec_rejected_3(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK:for [[IV4:%[arg0-9]+]] = 0 to [[ARG_M]] step 128 {
// CHECK-NEXT: for [[IV5:%[arg0-9]*]] = 0 to [[ARG_N]] {
// CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{[a-zA-Z0-9_]*}} : memref<?x?xf32>, vector<128xf32>
  affine.for %i4 = 0 to %M { // vectorized
    affine.for %i5 = 0 to %N { // not vectorized, would vectorize with --test-fastest-varying=1
      %a5 = affine.load %A[%i5, %i4] : memref<?x?xf32>
    }
  }
  return
}

// -----

// %i6 appears in both subscripts and %i7 only in the first one. Expected:
// neither loop receives a step.
// CHECK-LABEL: func @vec_rejected_4
func.func @vec_rejected_4(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for [[IV6:%[arg0-9]*]] = 0 to [[ARG_M]] {
// CHECK-NEXT: for [[IV7:%[arg0-9]*]] = 0 to [[ARG_N]] {
  affine.for %i6 = 0 to %M { // not vectorized, would vectorize with --test-fastest-varying=1
    affine.for %i7 = 0 to %N { // not vectorized, can never vectorize
      %a7 = affine.load %A[%i6 + %i7, %i6] : memref<?x?xf32>
    }
  }
  return
}

// -----

// The load reads A[%i10, %i11] and the store writes A[%i11, %i10], i.e. the
// two accesses use the loop indices in opposite order. Expected: neither loop
// receives a step.
// CHECK-LABEL: func @vec_rejected_5
func.func @vec_rejected_5(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for [[IV10:%[arg0-9]*]] = 0 to %{{[0-9]*}} {
// CHECK: for [[IV11:%[arg0-9]*]] = 0 to %{{[0-9]*}} {
  affine.for %i10 = 0 to %M { // not vectorized, need per load transposes
    affine.for %i11 = 0 to %N { // not vectorized, need per load transposes
      %a11 = affine.load %A[%i10, %i11] : memref<?x?xf32>
      affine.store %a11, %A[%i11, %i10] : memref<?x?xf32>
    }
  }
  return
}

// -----

// Three-deep nest over a 3-D memref. Expected: only the innermost loop %i14,
// which appears solely in the last subscript, is stepped by 128.
// CHECK-LABEL: func @vec_rejected_6
func.func @vec_rejected_6(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for [[IV12:%[arg0-9]*]] = 0 to %{{[0-9]*}} {
// CHECK: for [[IV13:%[arg0-9]*]] = 0 to %{{[0-9]*}} {
// CHECK: for [[IV14:%[arg0-9]+]] = 0 to [[ARG_P]] step 128
  affine.for %i12 = 0 to %M { // not vectorized, can never vectorize
    affine.for %i13 = 0 to %N { // not vectorized, can never vectorize
      affine.for %i14 = 0 to %P { // vectorized
        %a14 = affine.load %B[%i13, %i12 + %i13, %i12 + %i14] : memref<?x?x?xf32>
      }
    }
  }
  return
}

// -----

// The loaded memref already has a vector element type (vector<2xf32>).
// Expected: the loop keeps its original form.
// CHECK-LABEL: func @vec_rejected_7
func.func @vec_rejected_7(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
  affine.for %i16 = 0 to %M { // not vectorized, can't vectorize a vector load
    %a16 = memref.alloc(%M) : memref<?xvector<2xf32>>
    %l16 = affine.load %a16[%i16] : memref<?xvector<2xf32>>
  }
  return
}

// -----

// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)>
// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)>

// Two nested loops around a load with constant subscripts. Expected: the outer
// loop keeps its original form while the inner loop is stepped by 128 and its
// load becomes a transfer_read with the (d0, d1) -> (0) permutation map.
// CHECK-LABEL: func @vec_rejected_8
func.func @vec_rejected_8(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
// CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK: %{{.*}} = arith.constant 0.0{{.*}}: f32
// CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
  affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %i17
    affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector
      %a18 = affine.load %A[%c0, %c0] : memref<?x?xf32>
    }
  }
  return
}

// -----

// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)>
// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)>

// Same input as @vec_rejected_8, but the ops following the first affine.apply
// are required to appear on immediately consecutive lines.
// CHECK-LABEL: func @vec_rejected_9
func.func @vec_rejected_9(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
// CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
  affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %i17
    affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector
      %a18 = affine.load %A[%c0, %c0] : memref<?x?xf32>
    }
  }
  return
}

// -----

#set0 = affine_set<(i) : (i >= 0)>

// The loop body contains an affine.if. Expected: the loop keeps its original
// form.
// CHECK-LABEL: func @vec_rejected_10
func.func @vec_rejected_10(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: affine.for %{{.*}}{{[0-9]*}} = 0 to %{{[0-9]*}} {
  affine.for %i15 = 0 to %M { // not vectorized due to condition below
    affine.if #set0(%i15) {
      %a15 = affine.load %A[%c0, %c0] : memref<?x?xf32>
    }
  }
  return
}

// -----

// Load of A[%i11, %i10] followed by a store to A[%i10, %i11]. Expected:
// neither loop receives a step.
// CHECK-LABEL: func @vec_rejected_11
func.func @vec_rejected_11(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
  // CHECK-DAG: [[ARG_M:%[0-9]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
  // CHECK-DAG: [[ARG_N:%[0-9]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
  // CHECK-DAG: [[ARG_P:%[0-9]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

  // CHECK: for [[IV10:%[arg0-9]*]] = 0 to %{{[0-9]*}} {
  // CHECK: for [[IV11:%[arg0-9]*]] = 0 to %{{[0-9]*}} {
  // This is similar to vec_rejected_5, but the order of indices is different.
  affine.for %i10 = 0 to %M { // not vectorized
    affine.for %i11 = 0 to %N { // not vectorized
      %a11 = affine.load %A[%i11, %i10] : memref<?x?xf32>
      affine.store %a11, %A[%i10, %i11] : memref<?x?xf32>
    }
  }
  return
}

// -----

// This should not vectorize due to the sequential dependence in the loop.
// CHECK-LABEL: @vec_rejected_sequential
func.func @vec_rejected_sequential(%A : memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %N = memref.dim %A, %c0 : memref<?xf32>
  // Each iteration stores to A[%i + 1], the element the next iteration loads.
  affine.for %i = 0 to %N {
    // CHECK-NOT: vector
    %a = affine.load %A[%i] : memref<?xf32>
    // CHECK-NOT: vector
    affine.store %a, %A[%i + 1] : memref<?xf32>
  }
  return
}

// -----

// The loop body contains no load or store, only an add of two function
// arguments. Expected: both arguments are broadcast ahead of the loop and the
// add consumes the two broadcast values.
// CHECK-LABEL: @vec_no_load_store_ops
func.func @vec_no_load_store_ops(%a: f32, %b: f32) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 128 {
    %add = arith.addf %a, %b : f32
  }
  // CHECK-DAG: %[[bc1:.*]] = vector.broadcast
  // CHECK-DAG: %[[bc0:.*]] = vector.broadcast
  // CHECK: affine.for %{{.*}} = 0 to 128 step
  // CHECK-NEXT: %[[add:.*]] = arith.addf %[[bc0]], %[[bc1]]

  return
}

// -----

// This should not be vectorized due to the unsupported block argument (%i).
// Support for operands with linear evolution is needed.
// CHECK-LABEL: @vec_rejected_unsupported_block_arg
func.func @vec_rejected_unsupported_block_arg(%A : memref<512xi32>) {
  affine.for %i = 0 to 512 {
    // CHECK-NOT: vector
    %idx = arith.index_cast %i : index to i32
    affine.store %idx, %A[%i] : memref<512xi32>
  }
  return
}

// -----

// '%i' loop is vectorized, including the inner reduction over '%j'.

func.func @vec_non_vecdim_reduction(%in: memref<128x256xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    // The inner loop accumulates over %j, the first subscript of %in.
    %final_red = affine.for %j = 0 to 128 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%j, %i] : memref<128x256xf32>
      %add = arith.addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// Expected: the outer loop is stepped by 128, the accumulator is initialized
// from a dense zero vector, and the inner loop carries a vector<128xf32>.
// CHECK-LABEL: @vec_non_vecdim_reduction
// CHECK: affine.for %{{.*}} = 0 to 256 step 128 {
// CHECK: %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK: %[[final_red:.*]] = affine.for %{{.*}} = 0 to 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<128x256xf32>, vector<128xf32>
// CHECK: %[[add:.*]] = arith.addf %[[red_iter]], %[[ld]] : vector<128xf32>
// CHECK: affine.yield %[[add]] : vector<128xf32>
// CHECK: }
// CHECK: vector.transfer_write %[[final_red]], %{{.*}} : vector<128xf32>, memref<256xf32>
// CHECK: }

// -----

// '%i' loop is vectorized, including the inner reductions over '%j'.

func.func @vec_non_vecdim_reductions(%in0: memref<128x256xf32>, %in1: memref<128x256xi32>,
                                     %out0: memref<256xf32>, %out1: memref<256xi32>) {
  %zero = arith.constant 0.000000e+00 : f32
  %one = arith.constant 1 : i32
  affine.for %i = 0 to 256 {
    // Two accumulators are carried by the inner loop: an f32 sum and an i32
    // product.
    %red0, %red1 = affine.for %j = 0 to 128
      iter_args(%red_iter0 = %zero, %red_iter1 = %one) -> (f32, i32) {
      %ld0 = affine.load %in0[%j, %i] : memref<128x256xf32>
      %add = arith.addf %red_iter0, %ld0 : f32
      %ld1 = affine.load %in1[%j, %i] : memref<128x256xi32>
      %mul = arith.muli %red_iter1, %ld1 : i32
      affine.yield %add, %mul : f32, i32
    }
    affine.store %red0, %out0[%i] : memref<256xf32>
    affine.store %red1, %out1[%i] : memref<256xi32>
  }
  return
}

// Expected: the outer loop is stepped by 128, both initial values become dense
// vector constants, and the inner loop carries one vector per accumulator.
// CHECK-LABEL: @vec_non_vecdim_reductions
// CHECK: affine.for %{{.*}} = 0 to 256 step 128 {
// CHECK: %[[vone:.*]] = arith.constant dense<1> : vector<128xi32>
// CHECK: %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK: %[[reds:.*]]:2 = affine.for %{{.*}} = 0 to 128
// CHECK-SAME: iter_args(%[[red_iter0:.*]] = %[[vzero]], %[[red_iter1:.*]] = %[[vone]]) -> (vector<128xf32>, vector<128xi32>) {
// CHECK: %[[ld0:.*]] = vector.transfer_read %{{.*}} : memref<128x256xf32>, vector<128xf32>
// CHECK: %[[add:.*]] = arith.addf %[[red_iter0]], %[[ld0]] : vector<128xf32>
// CHECK: %[[ld1:.*]] = vector.transfer_read %{{.*}} : memref<128x256xi32>, vector<128xi32>
// CHECK: %[[mul:.*]] = arith.muli %[[red_iter1]], %[[ld1]] : vector<128xi32>
// CHECK: affine.yield %[[add]], %[[mul]] : vector<128xf32>, vector<128xi32>
// CHECK: }
// CHECK: vector.transfer_write %[[reds]]#0, %{{.*}} : vector<128xf32>, memref<256xf32>
// CHECK: vector.transfer_write %[[reds]]#1, %{{.*}} : vector<128xi32>, memref<256xi32>
// CHECK: }

// -----

// '%i' loop is vectorized, including the inner last value computation over '%j'.

func.func @vec_no_vecdim_last_value(%in: memref<128x256xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    // The inner loop yields the value it just loaded; %last_iter is not read.
    %last_val = affine.for %j = 0 to 128 iter_args(%last_iter = %cst) -> (f32) {
      %ld = affine.load %in[%j, %i] : memref<128x256xf32>
      affine.yield %ld : f32
    }
    affine.store %last_val, %out[%i] : memref<256xf32>
  }
  return
}

// Expected: the outer loop is stepped by 128 and the inner loop carries and
// yields a vector<128xf32>.
// CHECK-LABEL: @vec_no_vecdim_last_value
// CHECK: affine.for %{{.*}} = 0 to 256 step 128 {
// CHECK: %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK: %[[last_val:.*]] = affine.for %{{.*}} = 0 to 128 iter_args(%[[last_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<128x256xf32>, vector<128xf32>
// CHECK: affine.yield %[[ld]] : vector<128xf32>
// CHECK: }
// CHECK: vector.transfer_write %[[last_val]], %{{.*}} : vector<128xf32>, memref<256xf32>
// CHECK: }

// -----

// The inner reduction loop '%j' is not vectorized if we do not request
// reduction vectorization.

func.func @vec_vecdim_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    // Here the reduction index %j is the last subscript of the load.
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vec_vecdim_reduction_rejected
// CHECK-NOT: vector