// RUN: mlir-opt %s --sparse-compiler | \
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
//
// Do the same run, but now with SIMDization as well. This should not change the outcome.
//
// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=2 vl=8" | \
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s

#SV = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>
#DV = #sparse_tensor.encoding<{ dimLevelType = [ "dense"      ] }>
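// #SV stores only the nonzero entries of the vector (compressed), while #DV
// keeps all 32 entries but still routes the kernels through the sparse compiler
// as an annotated dense vector.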

#trait_reduction = {
  indexing_maps = [
    affine_map<(i) -> (i)>,  // a
    affine_map<(i) -> ()>    // x (scalar out)
  ],
  iterator_types = ["reduction"],
  doc = "x += OPER_i a(i)"
}
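// The single "reduction" iterator folds the 1-D input a(i) into the 0-D
// (scalar) output x; each kernel below only substitutes a different
// operation for OPER in the region body.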

// An example of vector reductions: sum, product, and bitwise and/or/xor
// kernels over 1-D tensors with sparse (#SV) and all-dense (#DV) annotations.
module {

  func.func @sum_reduction_i32(%arga: tensor<32xi32, #SV>,
                          %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.addi %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func.func @sum_reduction_f32(%arga: tensor<32xf32, #SV>,
                          %argx: tensor<f32>) -> tensor<f32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #SV>)
      outs(%argx: tensor<f32>) {
        ^bb(%a: f32, %x: f32):
          %0 = arith.addf %x, %a : f32
          linalg.yield %0 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }

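  // The product reductions take the all-dense annotated vector #DV: a sparse
  // kernel would visit only the stored entries, so an implicit zero (which
  // forces the true product to 0) could never be observed.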
  func.func @prod_reduction_i32(%arga: tensor<32xi32, #DV>,
                           %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #DV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.muli %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func.func @prod_reduction_f32(%arga: tensor<32xf32, #DV>,
                           %argx: tensor<f32>) -> tensor<f32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #DV>)
      outs(%argx: tensor<f32>) {
        ^bb(%a: f32, %x: f32):
          %0 = arith.mulf %x, %a : f32
          linalg.yield %0 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }

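  // The AND reduction likewise uses #DV, since a skipped implicit zero would
  // clear every bit of the true result.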
  func.func @and_reduction_i32(%arga: tensor<32xi32, #DV>,
                          %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #DV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.andi %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

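  // OR and XOR, like the sums, can safely run on the sparse vector #SV:
  // zero is the identity of these operations, so skipping the implicit
  // zeros does not change the result.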
  func.func @or_reduction_i32(%arga: tensor<32xi32, #SV>,
                         %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.ori %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func.func @xor_reduction_i32(%arga: tensor<32xi32, #SV>,
                          %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.xori %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func.func @dump_i32(%arg0 : tensor<i32>) {
    %v = tensor.extract %arg0[] : tensor<i32>
    vector.print %v : i32
    return
  }

  func.func @dump_f32(%arg0 : tensor<f32>) {
    %v = tensor.extract %arg0[] : tensor<f32>
    vector.print %v : f32
    return
  }

  func.func @entry() {
    %ri = arith.constant dense< 7   > : tensor<i32>
    %rf = arith.constant dense< 2.0 > : tensor<f32>
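    // Note that the reductions do not start from a neutral element: the 7
    // and 2.0 above are folded into every expected result checked below.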

    %c_0_i32 = arith.constant dense<[
      0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0,
      0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0
    ]> : tensor<32xi32>

    %c_0_f32 = arith.constant dense<[
      0.0, 1.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 2.5, 0.0, 0.0, 0.0,
      2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 9.0
    ]> : tensor<32xf32>

    %c_1_i32 = arith.constant dense<[
      1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3
    ]> : tensor<32xi32>

    %c_1_f32 = arith.constant dense<[
      1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0
    ]> : tensor<32xf32>

    // Convert constants to annotated tensors.
    %sparse_input_i32 = sparse_tensor.convert %c_0_i32
      : tensor<32xi32> to tensor<32xi32, #SV>
    %sparse_input_f32 = sparse_tensor.convert %c_0_f32
      : tensor<32xf32> to tensor<32xf32, #SV>
    %dense_input_i32 = sparse_tensor.convert %c_1_i32
      : tensor<32xi32> to tensor<32xi32, #DV>
    %dense_input_f32 = sparse_tensor.convert %c_1_f32
      : tensor<32xf32> to tensor<32xf32, #DV>

    // Call the kernels.
    %0 = call @sum_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %1 = call @sum_reduction_f32(%sparse_input_f32, %rf)
       : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
    %2 = call @prod_reduction_i32(%dense_input_i32, %ri)
       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %3 = call @prod_reduction_f32(%dense_input_f32, %rf)
       : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
    %4 = call @and_reduction_i32(%dense_input_i32, %ri)
       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %5 = call @or_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %6 = call @xor_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>

    // Verify results.
    //
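    // Expected values (including the initial 7 / 2.0):
    //   sum  i32:  7 + 2 + 1 + 4 + 3 + 9              = 26
    //   sum  f32:  2.0 + 1 + 4 + 3 + 2.5 + 2 + 4 + 9  = 27.5
    //   prod i32:  7 * 7 * 3 * 7 * 3                  = 3087
    //   prod f32:  2.0 * 3.5 * 2 * 3 * 4              = 168
    //   and  i32:  7 & 1 & 7 & 3 (over all entries)   = 1
    //   or   i32:  7 | 2 | 1 | 4 | 3 | 9              = 15
    //   xor  i32:  7 ^ 2 ^ 1 ^ 4 ^ 3 ^ 9              = 10
    //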
    // CHECK: 26
    // CHECK: 27.5
    // CHECK: 3087
    // CHECK: 168
    // CHECK: 1
    // CHECK: 15
    // CHECK: 10
    //
    call @dump_i32(%0) : (tensor<i32>) -> ()
    call @dump_f32(%1) : (tensor<f32>) -> ()
    call @dump_i32(%2) : (tensor<i32>) -> ()
    call @dump_f32(%3) : (tensor<f32>) -> ()
    call @dump_i32(%4) : (tensor<i32>) -> ()
    call @dump_i32(%5) : (tensor<i32>) -> ()
    call @dump_i32(%6) : (tensor<i32>) -> ()

    // Release the resources.
    bufferization.dealloc_tensor %sparse_input_i32 : tensor<32xi32, #SV>
    bufferization.dealloc_tensor %sparse_input_f32 : tensor<32xf32, #SV>
    bufferization.dealloc_tensor %dense_input_i32  : tensor<32xi32, #DV>
    bufferization.dealloc_tensor %dense_input_f32  : tensor<32xf32, #DV>

    return
  }
}