// RUN: mlir-opt %s \
// RUN:   --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
// RUN:   --sparsification --sparse-tensor-conversion \
// RUN:   --convert-vector-to-scf --convert-scf-to-std \
// RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
// RUN:   --std-bufferize --finalizing-bufferize --lower-affine \
// RUN:   --convert-vector-to-llvm --convert-memref-to-llvm \
// RUN:   --convert-std-to-llvm --reconcile-unrealized-casts | \
// RUN: mlir-cpu-runner \
// RUN:  -e entry -entry-point-result=void \
// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
//
// Do the same run, but now with SIMDization as well. This should not change the outcome.
//
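// Note: in the run below, vectorization-strategy=2 directs the sparsifier to
// vectorize inner loops for any storage type (dense or sparse), and vl=8
// sets the vector length to 8.
//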
// RUN: mlir-opt %s \
// RUN:   --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
// RUN:   --sparsification="vectorization-strategy=2 vl=8" --sparse-tensor-conversion \
// RUN:   --convert-vector-to-scf --convert-scf-to-std \
// RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
// RUN:   --std-bufferize --finalizing-bufferize --lower-affine \
// RUN:   --convert-vector-to-llvm --convert-memref-to-llvm \
// RUN:   --convert-std-to-llvm --reconcile-unrealized-casts | \
// RUN: mlir-cpu-runner \
// RUN:  -e entry -entry-point-result=void \
// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s

#SV = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>
#DV = #sparse_tensor.encoding<{ dimLevelType = [ "dense"      ] }>
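
// #SV stores only the nonzero entries of a vector, while #DV keeps all 32
// entries (still routed through the sparse storage scheme). The kernels
// below use #SV only for operators on which skipping the implicit zeros is
// sound (sum, or, xor have identity 0), and #DV where a skipped entry would
// change the result (prod, and).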

#trait_reduction = {
  indexing_maps = [
    affine_map<(i) -> (i)>,  // a
    affine_map<(i) -> ()>    // x (scalar out)
  ],
  iterator_types = ["reduction"],
  doc = "x += OPER_i a(i)"
}
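
// The trait maps the 1-D input a(i) onto a 0-D (scalar) output under a
// single reduction iterator: each kernel folds the selected elements of its
// input into the running scalar carried in the output tensor.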

// Examples of vector reductions: sum, product, and bitwise and/or/xor, over
// both sparse (#SV) and dense (#DV) annotated 1-D tensors.
module {

  func @sum_reduction_i32(%arga: tensor<32xi32, #SV>,
                          %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.addi %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }
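
  // After --sparsification, a kernel like the one above lowers to a loop
  // over just the stored entries of %arga. Roughly (a sketch; buffer and
  // value names are illustrative, not the actual generated IR):
  //
  //   %acc0 = memref.load %x[] : memref<i32>
  //   %red = scf.for %i = %lo to %hi step %c1 iter_args(%acc = %acc0) -> (i32) {
  //     %a = memref.load %values[%i] : memref<?xi32>
  //     %s = arith.addi %acc, %a : i32
  //     scf.yield %s : i32
  //   }
  //   memref.store %red, %x[] : memref<i32>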

  func @sum_reduction_f32(%arga: tensor<32xf32, #SV>,
                          %argx: tensor<f32>) -> tensor<f32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #SV>)
      outs(%argx: tensor<f32>) {
        ^bb(%a: f32, %x: f32):
          %0 = arith.addf %x, %a : f32
          linalg.yield %0 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }

  func @prod_reduction_i32(%arga: tensor<32xi32, #DV>,
                           %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #DV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.muli %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func @prod_reduction_f32(%arga: tensor<32xf32, #DV>,
                           %argx: tensor<f32>) -> tensor<f32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #DV>)
      outs(%argx: tensor<f32>) {
        ^bb(%a: f32, %x: f32):
          %0 = arith.mulf %x, %a : f32
          linalg.yield %0 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }

  func @and_reduction_i32(%arga: tensor<32xi32, #DV>,
                          %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #DV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.andi %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func @or_reduction_i32(%arga: tensor<32xi32, #SV>,
                         %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.ori %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func @xor_reduction_i32(%arga: tensor<32xi32, #SV>,
                          %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.xori %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func @dump_i32(%arg0 : memref<i32>) {
    %v = memref.load %arg0[] : memref<i32>
    vector.print %v : i32
    return
  }

  func @dump_f32(%arg0 : memref<f32>) {
    %v = memref.load %arg0[] : memref<f32>
    vector.print %v : f32
    return
  }
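
  // These helpers load the 0-d result buffers and print the scalar values
  // with vector.print, which is what the CHECK lines in @entry match.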

  func @entry() {
    %ri = arith.constant dense< 7   > : tensor<i32>
    %rf = arith.constant dense< 2.0 > : tensor<f32>

    %c_0_i32 = arith.constant dense<[
      0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0,
      0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0
    ]> : tensor<32xi32>

    %c_0_f32 = arith.constant dense<[
      0.0, 1.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 2.5, 0.0, 0.0, 0.0,
      2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 9.0
    ]> : tensor<32xf32>

    %c_1_i32 = arith.constant dense<[
      1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3
    ]> : tensor<32xi32>

    %c_1_f32 = arith.constant dense<[
      1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0
    ]> : tensor<32xf32>

    // Convert constants to annotated tensors.
    %sparse_input_i32 = sparse_tensor.convert %c_0_i32
      : tensor<32xi32> to tensor<32xi32, #SV>
    %sparse_input_f32 = sparse_tensor.convert %c_0_f32
      : tensor<32xf32> to tensor<32xf32, #SV>
    %dense_input_i32 = sparse_tensor.convert %c_1_i32
      : tensor<32xi32> to tensor<32xi32, #DV>
    %dense_input_f32 = sparse_tensor.convert %c_1_f32
      : tensor<32xf32> to tensor<32xf32, #DV>
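
    // Each convert above materializes its constant into storage managed by
    // the sparse runtime support library; the #SV tensors end up holding
    // only the five (i32) and seven (f32) nonzero entries.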

    // Call the kernels.
    %0 = call @sum_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %1 = call @sum_reduction_f32(%sparse_input_f32, %rf)
       : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
    %2 = call @prod_reduction_i32(%dense_input_i32, %ri)
       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %3 = call @prod_reduction_f32(%dense_input_f32, %rf)
       : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
    %4 = call @and_reduction_i32(%dense_input_i32, %ri)
       : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %5 = call @or_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %6 = call @xor_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>

    // Verify results.
    //
    // CHECK: 26
    // CHECK: 27.5
    // CHECK: 3087
    // CHECK: 168
    // CHECK: 1
    // CHECK: 15
    // CHECK: 10
    //
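    // These values follow from the inputs and the initial scalars
    // (%ri = 7, %rf = 2.0); the #SV kernels reduce over stored entries only:
    //   sum  i32:   7 + (2 + 1 + 4 + 3 + 9)           = 26
    //   sum  f32: 2.0 + (1 + 4 + 3 + 2.5 + 2 + 4 + 9) = 27.5
    //   prod i32:   7 * (7 * 3 * 7 * 3)               = 3087
    //   prod f32: 2.0 * (3.5 * 2 * 3 * 4)             = 168
    //   and  i32:   7 & 1 & 7 & 3 & ... (all entries) = 1
    //   or   i32:   7 | 2 | 1 | 4 | 3 | 9             = 15
    //   xor  i32:   7 ^ 2 ^ 1 ^ 4 ^ 3 ^ 9             = 10
    //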
    %m0 = bufferization.to_memref %0 : memref<i32>
    call @dump_i32(%m0) : (memref<i32>) -> ()
    %m1 = bufferization.to_memref %1 : memref<f32>
    call @dump_f32(%m1) : (memref<f32>) -> ()
    %m2 = bufferization.to_memref %2 : memref<i32>
    call @dump_i32(%m2) : (memref<i32>) -> ()
    %m3 = bufferization.to_memref %3 : memref<f32>
    call @dump_f32(%m3) : (memref<f32>) -> ()
    %m4 = bufferization.to_memref %4 : memref<i32>
    call @dump_i32(%m4) : (memref<i32>) -> ()
    %m5 = bufferization.to_memref %5 : memref<i32>
    call @dump_i32(%m5) : (memref<i32>) -> ()
    %m6 = bufferization.to_memref %6 : memref<i32>
    call @dump_i32(%m6) : (memref<i32>) -> ()

    // Release the resources.
    sparse_tensor.release %sparse_input_i32 : tensor<32xi32, #SV>
    sparse_tensor.release %sparse_input_f32 : tensor<32xf32, #SV>
    sparse_tensor.release %dense_input_i32  : tensor<32xi32, #DV>
    sparse_tensor.release %dense_input_f32  : tensor<32xf32, #DV>
    memref.dealloc %m0 : memref<i32>
    memref.dealloc %m1 : memref<f32>
    memref.dealloc %m2 : memref<i32>
    memref.dealloc %m3 : memref<f32>
    memref.dealloc %m4 : memref<i32>
    memref.dealloc %m5 : memref<i32>
    memref.dealloc %m6 : memref<i32>

    return
  }
}