// RUN: mlir-opt %s --sparse-compiler | \
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
//
// Do the same run, but now with SIMDization as well. This should not change the outcome.
//
// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=2 vl=8" | \
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s

// Sparse tensor encodings for 1-D tensors (one dimension level type each):
//   #SV marks the single dimension as "compressed",
//   #DV marks the single dimension as "dense".
#SV = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>
#DV = #sparse_tensor.encoding<{ dimLevelType = [ "dense"      ] }>

// Trait used by the linalg.generic kernels below: one input `a` indexed by
// `i`, one rank-0 output `x`, and a single "reduction" iterator, so each
// kernel folds all elements of `a` into the scalar `x`.
#trait_reduction = {
  indexing_maps = [
    affine_map<(i) -> (i)>,  // a
    affine_map<(i) -> ()>    // x (scalar out)
  ],
  iterator_types = ["reduction"],
  doc = "x += OPER_i a(i)"
}

// An example of vector reductions.
module {

  // Sum reduction (i32): folds every element of the #SV-annotated vector
  // %arga into the scalar %argx with arith.addi and returns the new scalar.
  func.func @sum_reduction_i32(%arga: tensor<32xi32, #SV>,
                          %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.addi %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  // Sum reduction (f32): same shape as the i32 variant, using arith.addf.
  func.func @sum_reduction_f32(%arga: tensor<32xf32, #SV>,
                          %argx: tensor<f32>) -> tensor<f32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #SV>)
      outs(%argx: tensor<f32>) {
        ^bb(%a: f32, %x: f32):
          %0 = arith.addf %x, %a : f32
          linalg.yield %0 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }

  // Product reduction (i32): folds the #DV-annotated vector %arga into
  // %argx with arith.muli.
  func.func @prod_reduction_i32(%arga: tensor<32xi32, #DV>,
                           %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #DV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.muli %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  // Product reduction (f32): folds the #DV-annotated vector %arga into
  // %argx with arith.mulf.
  func.func @prod_reduction_f32(%arga: tensor<32xf32, #DV>,
                           %argx: tensor<f32>) -> tensor<f32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #DV>)
      outs(%argx: tensor<f32>) {
        ^bb(%a: f32, %x: f32):
          %0 = arith.mulf %x, %a : f32
          linalg.yield %0 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }

  // Bitwise-AND reduction (i32): folds the #DV-annotated vector %arga into
  // %argx with arith.andi.
  func.func @and_reduction_i32(%arga: tensor<32xi32, #DV>,
                          %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #DV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.andi %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  // Bitwise-OR reduction (i32): folds the #SV-annotated vector %arga into
  // %argx with arith.ori.
  func.func @or_reduction_i32(%arga: tensor<32xi32, #SV>,
                         %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.ori %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  // Bitwise-XOR reduction (i32): folds the #SV-annotated vector %arga into
  // %argx with arith.xori.
  func.func @xor_reduction_i32(%arga: tensor<32xi32, #SV>,
                          %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.xori %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  // Loads the i32 held in the rank-0 memref and prints it with vector.print.
  func.func @dump_i32(%arg0 : memref<i32>) {
    %v = memref.load %arg0[] : memref<i32>
    vector.print %v : i32
    return
  }

  // Loads the f32 held in the rank-0 memref and prints it with vector.print.
  func.func @dump_f32(%arg0 : memref<f32>) {
    %v = memref.load %arg0[] : memref<f32>
    vector.print %v : f32
    return
  }

  // Test driver: builds the inputs, runs all seven reductions, prints each
  // result in call order, and then releases what it created.
  func.func @entry() {
    // Initial values of the scalar accumulators passed as %argx.
    %ri = arith.constant dense< 7   > : tensor<i32>
    %rf = arith.constant dense< 2.0 > : tensor<f32>

    // Mostly-zero inputs; converted to the #SV encoding below.
    %c_0_i32 = arith.constant dense<[
      0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0,
      0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0
    ]> : tensor<32xi32>

    %c_0_f32 = arith.constant dense<[
      0.0, 1.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 2.5, 0.0, 0.0, 0.0,
      2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 9.0
    ]> : tensor<32xf32>

    // Mostly-one inputs; converted to the #DV encoding below.
    %c_1_i32 = arith.constant dense<[
      1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3
    ]> : tensor<32xi32>

    %c_1_f32 = arith.constant dense<[
      1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0
    ]> : tensor<32xf32>

    // Convert constants to annotated tensors.
    %sparse_input_i32 = sparse_tensor.convert %c_0_i32
      : tensor<32xi32> to tensor<32xi32, #SV>
    %sparse_input_f32 = sparse_tensor.convert %c_0_f32
      : tensor<32xf32> to tensor<32xf32, #SV>
    %dense_input_i32 = sparse_tensor.convert %c_1_i32
      : tensor<32xi32> to tensor<32xi32, #DV>
    %dense_input_f32 = sparse_tensor.convert %c_1_f32
      : tensor<32xf32> to tensor<32xf32, #DV>

    // Call the kernels.
    %0 = call @sum_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %1 = call @sum_reduction_f32(%sparse_input_f32, %rf)
       : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
    %2 = call @prod_reduction_i32(%dense_input_i32, %ri)
       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %3 = call @prod_reduction_f32(%dense_input_f32, %rf)
       : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
    %4 = call @and_reduction_i32(%dense_input_i32, %ri)
       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %5 = call @or_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %6 = call @xor_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>

    // Verify results.
    //
    // How each expected line follows from the constants above:
    //   sum  i32 : 7   + (2 + 1 + 4 + 3 + 9)                     = 26
    //   sum  f32 : 2.0 + (1 + 4 + 3 + 2.5 + 2 + 4 + 9)           = 27.5
    //   prod i32 : 7   * (7 * 3 * 7 * 3)                         = 3087
    //   prod f32 : 2.0 * (3.5 * 2 * 3 * 4)                       = 168
    //   and  i32 : 7 & 1 & 7 & 3 & 7 & 3 (all other inputs are 1) = 1
    //   or   i32 : 7 | 2 | 1 | 4 | 3 | 9                         = 15
    //   xor  i32 : 7 ^ 2 ^ 1 ^ 4 ^ 3 ^ 9                         = 10
    //
    // CHECK: 26
    // CHECK: 27.5
    // CHECK: 3087
    // CHECK: 168
    // CHECK: 1
    // CHECK: 15
    // CHECK: 10
    //
    %m0 = bufferization.to_memref %0 : memref<i32>
    call @dump_i32(%m0) : (memref<i32>) -> ()
    %m1 = bufferization.to_memref %1 : memref<f32>
    call @dump_f32(%m1) : (memref<f32>) -> ()
    %m2 = bufferization.to_memref %2 : memref<i32>
    call @dump_i32(%m2) : (memref<i32>) -> ()
    %m3 = bufferization.to_memref %3 : memref<f32>
    call @dump_f32(%m3) : (memref<f32>) -> ()
    %m4 = bufferization.to_memref %4 : memref<i32>
    call @dump_i32(%m4) : (memref<i32>) -> ()
    %m5 = bufferization.to_memref %5 : memref<i32>
    call @dump_i32(%m5) : (memref<i32>) -> ()
    %m6 = bufferization.to_memref %6 : memref<i32>
    call @dump_i32(%m6) : (memref<i32>) -> ()

    // Release the resources.
    sparse_tensor.release %sparse_input_i32 : tensor<32xi32, #SV>
    sparse_tensor.release %sparse_input_f32 : tensor<32xf32, #SV>
    sparse_tensor.release %dense_input_i32  : tensor<32xi32, #DV>
    sparse_tensor.release %dense_input_f32  : tensor<32xf32, #DV>
    memref.dealloc %m0 : memref<i32>
    memref.dealloc %m1 : memref<f32>
    memref.dealloc %m2 : memref<i32>
    memref.dealloc %m3 : memref<f32>
    memref.dealloc %m4 : memref<i32>
    memref.dealloc %m5 : memref<i32>
    memref.dealloc %m6 : memref<i32>

    return
  }
}