// RUN: mlir-opt %s \
// RUN:   --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
// RUN:   --sparsification --sparse-tensor-conversion \
// RUN:   --convert-vector-to-scf --convert-scf-to-std \
// RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
// RUN:   --std-bufferize --finalizing-bufferize --lower-affine \
// RUN:   --convert-vector-to-llvm --convert-memref-to-llvm \
// RUN:   --convert-std-to-llvm --reconcile-unrealized-casts | \
// RUN: mlir-cpu-runner \
// RUN:  -e entry -entry-point-result=void \
// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
//
// Do the same run, but now with SIMDization as well. This should not change the outcome.
//
// RUN: mlir-opt %s \
// RUN:   --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
// RUN:   --sparsification="vectorization-strategy=2 vl=8" --sparse-tensor-conversion \
// RUN:   --convert-vector-to-scf --convert-scf-to-std \
// RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
// RUN:   --std-bufferize --finalizing-bufferize --lower-affine \
// RUN:   --convert-vector-to-llvm --convert-memref-to-llvm \
// RUN:   --convert-std-to-llvm --reconcile-unrealized-casts | \
// RUN: mlir-cpu-runner \
// RUN:  -e entry -entry-point-result=void \
// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s

#SV = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>
#DV = #sparse_tensor.encoding<{ dimLevelType = [ "dense" ] }>

#trait_reduction = {
  indexing_maps = [
    affine_map<(i) -> (i)>,  // a
    affine_map<(i) -> ()>    // x (scalar out)
  ],
  iterator_types = ["reduction"],
  doc = "x += OPER_i a(i)"
}

// An example of vector reductions.
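//
// The kernels below exercise sum, product, and bitwise and/or/xor
// reductions of a 1-D input tensor into a 0-D (scalar) output tensor,
// for both sparse (#SV) and all-dense (#DV) annotated inputs. Each
// kernel folds the initial scalar in %argx into its result; for
// instance, the i32 sum starts from 7 and adds the stored values
// 2 + 1 + 4 + 3 + 9 = 19, yielding the value 26 verified by the
// CHECK lines in @entry.
//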
module {

  func @sum_reduction_i32(%arga: tensor<32xi32, #SV>,
                          %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.addi %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func @sum_reduction_f32(%arga: tensor<32xf32, #SV>,
                          %argx: tensor<f32>) -> tensor<f32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #SV>)
      outs(%argx: tensor<f32>) {
        ^bb(%a: f32, %x: f32):
          %0 = arith.addf %x, %a : f32
          linalg.yield %0 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }

  func @prod_reduction_i32(%arga: tensor<32xi32, #DV>,
                           %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #DV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.muli %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func @prod_reduction_f32(%arga: tensor<32xf32, #DV>,
                           %argx: tensor<f32>) -> tensor<f32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #DV>)
      outs(%argx: tensor<f32>) {
        ^bb(%a: f32, %x: f32):
          %0 = arith.mulf %x, %a : f32
          linalg.yield %0 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }

  func @and_reduction_i32(%arga: tensor<32xi32, #DV>,
                          %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #DV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.andi %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func @or_reduction_i32(%arga: tensor<32xi32, #SV>,
                         %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.ori %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func @xor_reduction_i32(%arga: tensor<32xi32, #SV>,
                          %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.xori %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func @dump_i32(%arg0 : memref<i32>) {
    %v = memref.load %arg0[] : memref<i32>
    vector.print %v : i32
    return
  }

  func @dump_f32(%arg0 : memref<f32>) {
    %v = memref.load %arg0[] : memref<f32>
    vector.print %v : f32
    return
  }

  func @entry() {
    %ri = arith.constant dense< 7 > : tensor<i32>
    %rf = arith.constant dense< 2.0 > : tensor<f32>

    %c_0_i32 = arith.constant dense<[
      0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0,
      0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0
    ]> : tensor<32xi32>

    %c_0_f32 = arith.constant dense<[
      0.0, 1.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 2.5, 0.0, 0.0, 0.0,
      2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 9.0
    ]> : tensor<32xf32>

    %c_1_i32 = arith.constant dense<[
      1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3
    ]> : tensor<32xi32>

    %c_1_f32 = arith.constant dense<[
      1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0
    ]> : tensor<32xf32>

    // Convert constants to annotated tensors.
    %sparse_input_i32 = sparse_tensor.convert %c_0_i32
      : tensor<32xi32> to tensor<32xi32, #SV>
    %sparse_input_f32 = sparse_tensor.convert %c_0_f32
      : tensor<32xf32> to tensor<32xf32, #SV>
    %dense_input_i32 = sparse_tensor.convert %c_1_i32
      : tensor<32xi32> to tensor<32xi32, #DV>
    %dense_input_f32 = sparse_tensor.convert %c_1_f32
      : tensor<32xf32> to tensor<32xf32, #DV>

    // Call the kernels.
    %0 = call @sum_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %1 = call @sum_reduction_f32(%sparse_input_f32, %rf)
       : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
    %2 = call @prod_reduction_i32(%dense_input_i32, %ri)
       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %3 = call @prod_reduction_f32(%dense_input_f32, %rf)
       : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
    %4 = call @and_reduction_i32(%dense_input_i32, %ri)
       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %5 = call @or_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %6 = call @xor_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>

    // Verify results.
    //
    // CHECK: 26
    // CHECK: 27.5
    // CHECK: 3087
    // CHECK: 168
    // CHECK: 1
    // CHECK: 15
    // CHECK: 10
    //
    %m0 = bufferization.to_memref %0 : memref<i32>
    call @dump_i32(%m0) : (memref<i32>) -> ()
    %m1 = bufferization.to_memref %1 : memref<f32>
    call @dump_f32(%m1) : (memref<f32>) -> ()
    %m2 = bufferization.to_memref %2 : memref<i32>
    call @dump_i32(%m2) : (memref<i32>) -> ()
    %m3 = bufferization.to_memref %3 : memref<f32>
    call @dump_f32(%m3) : (memref<f32>) -> ()
    %m4 = bufferization.to_memref %4 : memref<i32>
    call @dump_i32(%m4) : (memref<i32>) -> ()
    %m5 = bufferization.to_memref %5 : memref<i32>
    call @dump_i32(%m5) : (memref<i32>) -> ()
    %m6 = bufferization.to_memref %6 : memref<i32>
    call @dump_i32(%m6) : (memref<i32>) -> ()

    // Release the resources.
    sparse_tensor.release %sparse_input_i32 : tensor<32xi32, #SV>
    sparse_tensor.release %sparse_input_f32 : tensor<32xf32, #SV>
    sparse_tensor.release %dense_input_i32 : tensor<32xi32, #DV>
    sparse_tensor.release %dense_input_f32 : tensor<32xf32, #DV>
    memref.dealloc %m0 : memref<i32>
    memref.dealloc %m1 : memref<f32>
    memref.dealloc %m2 : memref<i32>
    memref.dealloc %m3 : memref<f32>
    memref.dealloc %m4 : memref<i32>
    memref.dealloc %m5 : memref<i32>
    memref.dealloc %m6 : memref<i32>

    return
  }
}