// RUN: mlir-opt %s --sparse-compiler | \
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
//
// Do the same run, but now with SIMDization as well. This should not change the outcome.
//
// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=2 vl=8" | \
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s

#SV = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>
#DV = #sparse_tensor.encoding<{ dimLevelType = [ "dense" ] }>

// Generic trait for a scalar reduction x = OPER_i a(i) over a 1-d tensor.
#trait_reduction = {
  indexing_maps = [
    affine_map<(i) -> (i)>, // a
    affine_map<(i) -> ()>   // x (scalar out)
  ],
  iterator_types = ["reduction"],
  doc = "x += OPER_i a(i)"
}

// An example of vector reductions.
module {

  // Sum reduction of a sparse i32 vector into a scalar (x += a(i)).
  func.func @sum_reduction_i32(%arga: tensor<32xi32, #SV>,
                               %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.addi %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  // Sum reduction of a sparse f32 vector into a scalar (x += a(i)).
  func.func @sum_reduction_f32(%arga: tensor<32xf32, #SV>,
                               %argx: tensor<f32>) -> tensor<f32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #SV>)
      outs(%argx: tensor<f32>) {
        ^bb(%a: f32, %x: f32):
          %0 = arith.addf %x, %a : f32
          linalg.yield %0 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }

  // Product reduction of a dense i32 vector into a scalar (x *= a(i)).
  // Uses the dense annotation, since a product of zeros is zero anyway.
  func.func @prod_reduction_i32(%arga: tensor<32xi32, #DV>,
                                %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #DV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.muli %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  // Product reduction of a dense f32 vector into a scalar (x *= a(i)).
  func.func @prod_reduction_f32(%arga: tensor<32xf32, #DV>,
                                %argx: tensor<f32>) -> tensor<f32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #DV>)
      outs(%argx: tensor<f32>) {
        ^bb(%a: f32, %x: f32):
          %0 = arith.mulf %x, %a : f32
          linalg.yield %0 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }

  // Bitwise-and reduction of a dense i32 vector into a scalar (x &= a(i)).
  func.func @and_reduction_i32(%arga: tensor<32xi32, #DV>,
                               %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #DV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.andi %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  // Bitwise-or reduction of a sparse i32 vector into a scalar (x |= a(i)).
  func.func @or_reduction_i32(%arga: tensor<32xi32, #SV>,
                              %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.ori %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  // Bitwise-xor reduction of a sparse i32 vector into a scalar (x ^= a(i)).
  func.func @xor_reduction_i32(%arga: tensor<32xi32, #SV>,
                               %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.xori %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  // Prints the scalar held in a 0-d i32 tensor.
  func.func @dump_i32(%arg0 : tensor<i32>) {
    %v = tensor.extract %arg0[] : tensor<i32>
    vector.print %v : i32
    return
  }

  // Prints the scalar held in a 0-d f32 tensor.
  func.func @dump_f32(%arg0 : tensor<f32>) {
    %v = tensor.extract %arg0[] : tensor<f32>
    vector.print %v : f32
    return
  }

  // Sets up test inputs, runs all reduction kernels, and verifies the
  // printed results against the CHECK lines below.
  func.func @entry() {
    // Initial reduction values (also folded into the result).
    %ri = arith.constant dense< 7 > : tensor<i32>
    %rf = arith.constant dense< 2.0 > : tensor<f32>

    // Inputs for the additive reductions (mostly zero, hence sparse).
    %c_0_i32 = arith.constant dense<[
      0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0,
      0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0
    ]> : tensor<32xi32>

    %c_0_f32 = arith.constant dense<[
      0.0, 1.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 2.5, 0.0, 0.0, 0.0,
      2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 9.0
    ]> : tensor<32xf32>

    // Inputs for the multiplicative/bitwise reductions (mostly one, hence dense).
    %c_1_i32 = arith.constant dense<[
      1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3
    ]> : tensor<32xi32>

    %c_1_f32 = arith.constant dense<[
      1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0
    ]> : tensor<32xf32>

    // Convert constants to annotated tensors.
    %sparse_input_i32 = sparse_tensor.convert %c_0_i32
      : tensor<32xi32> to tensor<32xi32, #SV>
    %sparse_input_f32 = sparse_tensor.convert %c_0_f32
      : tensor<32xf32> to tensor<32xf32, #SV>
    %dense_input_i32 = sparse_tensor.convert %c_1_i32
      : tensor<32xi32> to tensor<32xi32, #DV>
    %dense_input_f32 = sparse_tensor.convert %c_1_f32
      : tensor<32xf32> to tensor<32xf32, #DV>

    // Call the kernels.
    %0 = call @sum_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %1 = call @sum_reduction_f32(%sparse_input_f32, %rf)
       : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
    %2 = call @prod_reduction_i32(%dense_input_i32, %ri)
       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %3 = call @prod_reduction_f32(%dense_input_f32, %rf)
       : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
    %4 = call @and_reduction_i32(%dense_input_i32, %ri)
       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %5 = call @or_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %6 = call @xor_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>

    // Verify results.
    //
    // CHECK: 26
    // CHECK: 27.5
    // CHECK: 3087
    // CHECK: 168
    // CHECK: 1
    // CHECK: 15
    // CHECK: 10
    //
    call @dump_i32(%0) : (tensor<i32>) -> ()
    call @dump_f32(%1) : (tensor<f32>) -> ()
    call @dump_i32(%2) : (tensor<i32>) -> ()
    call @dump_f32(%3) : (tensor<f32>) -> ()
    call @dump_i32(%4) : (tensor<i32>) -> ()
    call @dump_i32(%5) : (tensor<i32>) -> ()
    call @dump_i32(%6) : (tensor<i32>) -> ()

    // Release the resources.
    bufferization.dealloc_tensor %sparse_input_i32 : tensor<32xi32, #SV>
    bufferization.dealloc_tensor %sparse_input_f32 : tensor<32xf32, #SV>
    bufferization.dealloc_tensor %dense_input_i32 : tensor<32xi32, #DV>
    bufferization.dealloc_tensor %dense_input_f32 : tensor<32xf32, #DV>

    return
  }
}