// RUN: mlir-opt %s --sparse-compiler | \
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
//
// Do the same run, but now with SIMDization as well. This should not change the outcome.
//
// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=2 vl=8" | \
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s

// Encodings for the 1-D kernel inputs: #SV marks the single dimension as
// "compressed", #DV marks it as "dense".
#SV = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>
#DV = #sparse_tensor.encoding<{ dimLevelType = [ "dense" ] }>

// Shared linalg.generic trait: one input vector indexed by i, one rank-0
// output, and a single "reduction" iterator.
#trait_reduction = {
  indexing_maps = [
    affine_map<(i) -> (i)>,  // input vector a(i)
    affine_map<(i) -> ()>    // scalar accumulator x
  ],
  iterator_types = ["reduction"],
  doc = "x += OPER_i a(i)"
}

// Vector reductions over annotated tensors. Every kernel below has the same
// shape: it folds the 32-element input %vec into the rank-0 tensor %init with
// one arith operation and returns the resulting rank-0 tensor.
module {

  // Integer sum over a #SV vector.
  func.func @sum_reduction_i32(%vec: tensor<32xi32, #SV>,
                               %init: tensor<i32>) -> tensor<i32> {
    %result = linalg.generic #trait_reduction
      ins(%vec: tensor<32xi32, #SV>)
      outs(%init: tensor<i32>) {
        ^body(%elem: i32, %acc: i32):
          %next = arith.addi %acc, %elem : i32
          linalg.yield %next : i32
    } -> tensor<i32>
    return %result : tensor<i32>
  }

  // Floating-point sum over a #SV vector.
  func.func @sum_reduction_f32(%vec: tensor<32xf32, #SV>,
                               %init: tensor<f32>) -> tensor<f32> {
    %result = linalg.generic #trait_reduction
      ins(%vec: tensor<32xf32, #SV>)
      outs(%init: tensor<f32>) {
        ^body(%elem: f32, %acc: f32):
          %next = arith.addf %acc, %elem : f32
          linalg.yield %next : f32
    } -> tensor<f32>
    return %result : tensor<f32>
  }

  // Integer product over a #DV vector.
  func.func @prod_reduction_i32(%vec: tensor<32xi32, #DV>,
                                %init: tensor<i32>) -> tensor<i32> {
    %result = linalg.generic #trait_reduction
      ins(%vec: tensor<32xi32, #DV>)
      outs(%init: tensor<i32>) {
        ^body(%elem: i32, %acc: i32):
          %next = arith.muli %acc, %elem : i32
          linalg.yield %next : i32
    } -> tensor<i32>
    return %result : tensor<i32>
  }

  // Floating-point product over a #DV vector.
  func.func @prod_reduction_f32(%vec: tensor<32xf32, #DV>,
                                %init: tensor<f32>) -> tensor<f32> {
    %result = linalg.generic #trait_reduction
      ins(%vec: tensor<32xf32, #DV>)
      outs(%init: tensor<f32>) {
        ^body(%elem: f32, %acc: f32):
          %next = arith.mulf %acc, %elem : f32
          linalg.yield %next : f32
    } -> tensor<f32>
    return %result : tensor<f32>
  }

  // Bitwise AND over a #DV vector.
  func.func @and_reduction_i32(%vec: tensor<32xi32, #DV>,
                               %init: tensor<i32>) -> tensor<i32> {
    %result = linalg.generic #trait_reduction
      ins(%vec: tensor<32xi32, #DV>)
      outs(%init: tensor<i32>) {
        ^body(%elem: i32, %acc: i32):
          %next = arith.andi %acc, %elem : i32
          linalg.yield %next : i32
    } -> tensor<i32>
    return %result : tensor<i32>
  }

  // Bitwise OR over a #SV vector.
  func.func @or_reduction_i32(%vec: tensor<32xi32, #SV>,
                              %init: tensor<i32>) -> tensor<i32> {
    %result = linalg.generic #trait_reduction
      ins(%vec: tensor<32xi32, #SV>)
      outs(%init: tensor<i32>) {
        ^body(%elem: i32, %acc: i32):
          %next = arith.ori %acc, %elem : i32
          linalg.yield %next : i32
    } -> tensor<i32>
    return %result : tensor<i32>
  }

  // Bitwise XOR over a #SV vector.
  func.func @xor_reduction_i32(%vec: tensor<32xi32, #SV>,
                               %init: tensor<i32>) -> tensor<i32> {
    %result = linalg.generic #trait_reduction
      ins(%vec: tensor<32xi32, #SV>)
      outs(%init: tensor<i32>) {
        ^body(%elem: i32, %acc: i32):
          %next = arith.xori %acc, %elem : i32
          linalg.yield %next : i32
    } -> tensor<i32>
    return %result : tensor<i32>
  }

  // Loads the single i32 held by a rank-0 memref and prints it.
  func.func @dump_i32(%buf : memref<i32>) {
    %scalar = memref.load %buf[] : memref<i32>
    vector.print %scalar : i32
    return
  }

  // Loads the single f32 held by a rank-0 memref and prints it.
  func.func @dump_f32(%buf : memref<f32>) {
    %scalar = memref.load %buf[] : memref<f32>
    vector.print %scalar : f32
    return
  }

  // Test driver: builds the inputs, runs every kernel, prints each result,
  // and then frees what was created along the way.
  func.func @entry() {
    // Initial accumulator values handed to every kernel.
    %init_i32 = arith.constant dense<7> : tensor<i32>
    %init_f32 = arith.constant dense<2.0> : tensor<f32>

    // Mostly-zero data, converted to #SV below.
    %zeros_i32 = arith.constant dense<[
      0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0,
      0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0
    ]> : tensor<32xi32>

    %zeros_f32 = arith.constant dense<[
      0.0, 1.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 2.5, 0.0, 0.0, 0.0,
      2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 9.0
    ]> : tensor<32xf32>

    // Mostly-one data, converted to #DV below.
    %ones_i32 = arith.constant dense<[
      1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3
    ]> : tensor<32xi32>

    %ones_f32 = arith.constant dense<[
      1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0
    ]> : tensor<32xf32>

    // Turn the plain constants into annotated tensors.
    %sv_i32 = sparse_tensor.convert %zeros_i32
      : tensor<32xi32> to tensor<32xi32, #SV>
    %sv_f32 = sparse_tensor.convert %zeros_f32
      : tensor<32xf32> to tensor<32xf32, #SV>
    %dv_i32 = sparse_tensor.convert %ones_i32
      : tensor<32xi32> to tensor<32xi32, #DV>
    %dv_f32 = sparse_tensor.convert %ones_f32
      : tensor<32xf32> to tensor<32xf32, #DV>

    // Invoke each reduction kernel once.
    %r_sum_i32 = call @sum_reduction_i32(%sv_i32, %init_i32)
      : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %r_sum_f32 = call @sum_reduction_f32(%sv_f32, %init_f32)
      : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
    %r_prod_i32 = call @prod_reduction_i32(%dv_i32, %init_i32)
      : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %r_prod_f32 = call @prod_reduction_f32(%dv_f32, %init_f32)
      : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
    %r_and_i32 = call @and_reduction_i32(%dv_i32, %init_i32)
      : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %r_or_i32 = call @or_reduction_i32(%sv_i32, %init_i32)
      : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %r_xor_i32 = call @xor_reduction_i32(%sv_i32, %init_i32)
      : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>

    // Expected output, one value per kernel in call order:
    //   sum  i32 : 7 + (2 + 1 + 4 + 3 + 9)
    //   sum  f32 : 2.0 + (1.0 + 4.0 + 3.0 + 2.5 + 2.0 + 4.0 + 9.0)
    //   prod i32 : 7 * (7 * 3 * 7 * 3)
    //   prod f32 : 2.0 * (3.5 * 2.0 * 3.0 * 4.0)
    //   and  i32 : 7 & 1 & 7 & 3
    //   or   i32 : 7 | 2 | 1 | 4 | 3 | 9
    //   xor  i32 : 7 ^ 2 ^ 1 ^ 4 ^ 3 ^ 9
    //
    // CHECK: 26
    // CHECK: 27.5
    // CHECK: 3087
    // CHECK: 168
    // CHECK: 1
    // CHECK: 15
    // CHECK: 10
    //
    %buf_sum_i32 = bufferization.to_memref %r_sum_i32 : memref<i32>
    call @dump_i32(%buf_sum_i32) : (memref<i32>) -> ()
    %buf_sum_f32 = bufferization.to_memref %r_sum_f32 : memref<f32>
    call @dump_f32(%buf_sum_f32) : (memref<f32>) -> ()
    %buf_prod_i32 = bufferization.to_memref %r_prod_i32 : memref<i32>
    call @dump_i32(%buf_prod_i32) : (memref<i32>) -> ()
    %buf_prod_f32 = bufferization.to_memref %r_prod_f32 : memref<f32>
    call @dump_f32(%buf_prod_f32) : (memref<f32>) -> ()
    %buf_and_i32 = bufferization.to_memref %r_and_i32 : memref<i32>
    call @dump_i32(%buf_and_i32) : (memref<i32>) -> ()
    %buf_or_i32 = bufferization.to_memref %r_or_i32 : memref<i32>
    call @dump_i32(%buf_or_i32) : (memref<i32>) -> ()
    %buf_xor_i32 = bufferization.to_memref %r_xor_i32 : memref<i32>
    call @dump_i32(%buf_xor_i32) : (memref<i32>) -> ()

    // Free the annotated input tensors, then the seven result buffers.
    sparse_tensor.release %sv_i32 : tensor<32xi32, #SV>
    sparse_tensor.release %sv_f32 : tensor<32xf32, #SV>
    sparse_tensor.release %dv_i32 : tensor<32xi32, #DV>
    sparse_tensor.release %dv_f32 : tensor<32xf32, #DV>
    memref.dealloc %buf_sum_i32 : memref<i32>
    memref.dealloc %buf_sum_f32 : memref<f32>
    memref.dealloc %buf_prod_i32 : memref<i32>
    memref.dealloc %buf_prod_f32 : memref<f32>
    memref.dealloc %buf_and_i32 : memref<i32>
    memref.dealloc %buf_or_i32 : memref<i32>
    memref.dealloc %buf_xor_i32 : memref<i32>

    return
  }
}