1d86ed7fbStbbdev /*
2*c21e688aSSergey Zheltov     Copyright (c) 2005-2022 Intel Corporation
3d86ed7fbStbbdev 
4d86ed7fbStbbdev     Licensed under the Apache License, Version 2.0 (the "License");
5d86ed7fbStbbdev     you may not use this file except in compliance with the License.
6d86ed7fbStbbdev     You may obtain a copy of the License at
7d86ed7fbStbbdev 
8d86ed7fbStbbdev         http://www.apache.org/licenses/LICENSE-2.0
9d86ed7fbStbbdev 
10d86ed7fbStbbdev     Unless required by applicable law or agreed to in writing, software
11d86ed7fbStbbdev     distributed under the License is distributed on an "AS IS" BASIS,
12d86ed7fbStbbdev     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13d86ed7fbStbbdev     See the License for the specific language governing permissions and
14d86ed7fbStbbdev     limitations under the License.
15d86ed7fbStbbdev */
16d86ed7fbStbbdev 
17d86ed7fbStbbdev /* Example program that computes Fibonacci numbers in different ways.
18d86ed7fbStbbdev    Arguments are: [ Number [Threads [Repeats]]]
19d86ed7fbStbbdev    The defaults are Number=500 Threads=1:4 Repeats=1.
20d86ed7fbStbbdev 
21d86ed7fbStbbdev    The point of this program is to check that the library is working properly.
22d86ed7fbStbbdev    Most of the computations are deliberately silly and not expected to
23d86ed7fbStbbdev    show any speedup on multiprocessors.
24d86ed7fbStbbdev */
25d86ed7fbStbbdev 
26d86ed7fbStbbdev // enable assertions
27d86ed7fbStbbdev #ifdef NDEBUG
28d86ed7fbStbbdev #undef NDEBUG
29d86ed7fbStbbdev #endif
30d86ed7fbStbbdev 
31d86ed7fbStbbdev #include <cstdio>
32d86ed7fbStbbdev #include <cstdlib>
33d86ed7fbStbbdev #include <cassert>
34d86ed7fbStbbdev 
35d86ed7fbStbbdev #include <utility>
36d86ed7fbStbbdev #include <thread>
37d86ed7fbStbbdev #include <atomic>
38d86ed7fbStbbdev #include <mutex>
39d86ed7fbStbbdev 
40d86ed7fbStbbdev #include "oneapi/tbb/tick_count.h"
41d86ed7fbStbbdev #include "oneapi/tbb/blocked_range.h"
42d86ed7fbStbbdev #include "oneapi/tbb/concurrent_vector.h"
43d86ed7fbStbbdev #include "oneapi/tbb/concurrent_queue.h"
44d86ed7fbStbbdev #include "oneapi/tbb/concurrent_hash_map.h"
45d86ed7fbStbbdev #include "oneapi/tbb/parallel_for.h"
46d86ed7fbStbbdev #include "oneapi/tbb/parallel_reduce.h"
47d86ed7fbStbbdev #include "oneapi/tbb/parallel_scan.h"
48d86ed7fbStbbdev #include "oneapi/tbb/parallel_pipeline.h"
49d86ed7fbStbbdev #include "oneapi/tbb/spin_mutex.h"
50d86ed7fbStbbdev #include "oneapi/tbb/queuing_mutex.h"
51d86ed7fbStbbdev #include "oneapi/tbb/global_control.h"
52d86ed7fbStbbdev 
53d86ed7fbStbbdev //! type used for Fibonacci number computations
54d86ed7fbStbbdev typedef long long value;
55d86ed7fbStbbdev 
56d86ed7fbStbbdev //! Matrix 2x2 class
57d86ed7fbStbbdev struct Matrix2x2 {
58d86ed7fbStbbdev     //! Array of values
59d86ed7fbStbbdev     value v[2][2];
Matrix2x2Matrix2x260d86ed7fbStbbdev     Matrix2x2() {}
Matrix2x2Matrix2x261d86ed7fbStbbdev     Matrix2x2(value v00, value v01, value v10, value v11) {
62d86ed7fbStbbdev         v[0][0] = v00;
63d86ed7fbStbbdev         v[0][1] = v01;
64d86ed7fbStbbdev         v[1][0] = v10;
65d86ed7fbStbbdev         v[1][1] = v11;
66d86ed7fbStbbdev     }
67d86ed7fbStbbdev     Matrix2x2 operator*(const Matrix2x2 &to) const; //< Multiply two Matrices
68d86ed7fbStbbdev };
69d86ed7fbStbbdev //! Identity matrix
70d86ed7fbStbbdev static const Matrix2x2 MatrixIdentity(1, 0, 0, 1);
71d86ed7fbStbbdev //! Default matrix to multiply
72d86ed7fbStbbdev static const Matrix2x2 Matrix1110(1, 1, 1, 0);
73d86ed7fbStbbdev //! Raw arrays matrices multiply
74d86ed7fbStbbdev void Matrix2x2Multiply(const value a[2][2], const value b[2][2], value c[2][2]);
75d86ed7fbStbbdev 
76d86ed7fbStbbdev /////////////////////// Serial methods ////////////////////////
77d86ed7fbStbbdev 
78d86ed7fbStbbdev //! Plain serial sum
SerialFib(int n)79d86ed7fbStbbdev value SerialFib(int n) {
80d86ed7fbStbbdev     if (n < 2)
81d86ed7fbStbbdev         return n;
82d86ed7fbStbbdev     value a = 0, b = 1, sum;
83d86ed7fbStbbdev     int i;
84d86ed7fbStbbdev     for (i = 2; i <= n; i++) { // n is really index of Fibonacci number
85d86ed7fbStbbdev         sum = a + b;
86d86ed7fbStbbdev         a = b;
87d86ed7fbStbbdev         b = sum;
88d86ed7fbStbbdev     }
89d86ed7fbStbbdev     return sum;
90d86ed7fbStbbdev }
91d86ed7fbStbbdev //! Serial n-1 matrices multiplication
SerialMatrixFib(int n)92d86ed7fbStbbdev value SerialMatrixFib(int n) {
93d86ed7fbStbbdev     value c[2][2], a[2][2] = { { 1, 1 }, { 1, 0 } }, b[2][2] = { { 1, 1 }, { 1, 0 } };
94d86ed7fbStbbdev     int i;
95d86ed7fbStbbdev     for (i = 2; i < n; i++) { // Using condition to prevent copying of values
96d86ed7fbStbbdev         if (i & 1)
97d86ed7fbStbbdev             Matrix2x2Multiply(a, c, b);
98d86ed7fbStbbdev         else
99d86ed7fbStbbdev             Matrix2x2Multiply(a, b, c);
100d86ed7fbStbbdev     }
101d86ed7fbStbbdev     return (i & 1) ? c[0][0] : b[0][0]; // get result from upper left cell
102d86ed7fbStbbdev }
103d86ed7fbStbbdev //! Recursive summing. Just for complete list of serial algorithms, not used
SerialRecursiveFib(int n)104d86ed7fbStbbdev value SerialRecursiveFib(int n) {
105d86ed7fbStbbdev     value result;
106d86ed7fbStbbdev     if (n < 2)
107d86ed7fbStbbdev         result = n;
108d86ed7fbStbbdev     else
109d86ed7fbStbbdev         result = SerialRecursiveFib(n - 1) + SerialRecursiveFib(n - 2);
110d86ed7fbStbbdev     return result;
111d86ed7fbStbbdev }
112b15aabb3Stbbdev 
113b15aabb3Stbbdev // GCC 4.8 C++ standard library implements std::this_thread::yield as no-op.
114b15aabb3Stbbdev #if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN
yield()115b15aabb3Stbbdev static inline void yield() {
116b15aabb3Stbbdev     sched_yield();
117b15aabb3Stbbdev }
118b15aabb3Stbbdev #else
119b15aabb3Stbbdev using std::this_thread::yield;
120b15aabb3Stbbdev #endif
121b15aabb3Stbbdev 
122d86ed7fbStbbdev //! Introducing of queue method in serial
SerialQueueFib(int n)123d86ed7fbStbbdev value SerialQueueFib(int n) {
124d86ed7fbStbbdev     oneapi::tbb::concurrent_queue<Matrix2x2> Q;
125d86ed7fbStbbdev     for (int i = 1; i < n; i++)
126d86ed7fbStbbdev         Q.push(Matrix1110);
127d86ed7fbStbbdev     Matrix2x2 A, B;
128d86ed7fbStbbdev     while (true) {
129d86ed7fbStbbdev         while (!Q.try_pop(A))
130b15aabb3Stbbdev             yield();
131d86ed7fbStbbdev         if (Q.empty())
132d86ed7fbStbbdev             break;
133d86ed7fbStbbdev         while (!Q.try_pop(B))
134b15aabb3Stbbdev             yield();
135d86ed7fbStbbdev         Q.push(A * B);
136d86ed7fbStbbdev     }
137d86ed7fbStbbdev     return A.v[0][0];
138d86ed7fbStbbdev }
139d86ed7fbStbbdev //! Trying to use concurrent_vector
SerialVectorFib(int n)140d86ed7fbStbbdev value SerialVectorFib(int n) {
141d86ed7fbStbbdev     oneapi::tbb::concurrent_vector<value> A;
142d86ed7fbStbbdev     A.grow_by(2);
143d86ed7fbStbbdev     A[0] = 0;
144d86ed7fbStbbdev     A[1] = 1;
145d86ed7fbStbbdev     for (int i = 2; i <= n; i++) {
146d86ed7fbStbbdev         A.grow_to_at_least(i + 1);
147d86ed7fbStbbdev         A[i] = A[i - 1] + A[i - 2];
148d86ed7fbStbbdev     }
149d86ed7fbStbbdev     return A[n];
150d86ed7fbStbbdev }
151d86ed7fbStbbdev 
152d86ed7fbStbbdev ///////////////////// Parallel methods ////////////////////////
153d86ed7fbStbbdev 
154d86ed7fbStbbdev // *** Serial shared by mutexes *** //
155d86ed7fbStbbdev 
156d86ed7fbStbbdev //! Shared glabals
157d86ed7fbStbbdev value SharedA = 0, SharedB = 1;
158d86ed7fbStbbdev int SharedI = 1, SharedN;
159d86ed7fbStbbdev 
160d86ed7fbStbbdev //! Template task class which computes Fibonacci numbers with shared globals
161d86ed7fbStbbdev template <typename M>
162d86ed7fbStbbdev class SharedSerialFibBody {
163d86ed7fbStbbdev     M &mutex;
164d86ed7fbStbbdev 
165d86ed7fbStbbdev public:
SharedSerialFibBody(M & m)166d86ed7fbStbbdev     SharedSerialFibBody(M &m) : mutex(m) {}
167d86ed7fbStbbdev     //! main loop
operator ()(const oneapi::tbb::blocked_range<int> & range) const168d86ed7fbStbbdev     void operator()(const oneapi::tbb::blocked_range<int> &range) const {
169d86ed7fbStbbdev         for (;;) {
170d86ed7fbStbbdev             typename M::scoped_lock lock(mutex);
171d86ed7fbStbbdev             if (SharedI >= SharedN)
172d86ed7fbStbbdev                 break;
173d86ed7fbStbbdev             value sum = SharedA + SharedB;
174d86ed7fbStbbdev             SharedA = SharedB;
175d86ed7fbStbbdev             SharedB = sum;
176d86ed7fbStbbdev             ++SharedI;
177d86ed7fbStbbdev         }
178d86ed7fbStbbdev     }
179d86ed7fbStbbdev };
180d86ed7fbStbbdev 
181d86ed7fbStbbdev template <>
operator ()(const oneapi::tbb::blocked_range<int> & range) const182d86ed7fbStbbdev void SharedSerialFibBody<std::mutex>::operator()(
183d86ed7fbStbbdev     const oneapi::tbb::blocked_range<int> &range) const {
184d86ed7fbStbbdev     for (;;) {
185d86ed7fbStbbdev         std::lock_guard<std::mutex> lock(mutex);
186d86ed7fbStbbdev         if (SharedI >= SharedN)
187d86ed7fbStbbdev             break;
188d86ed7fbStbbdev         value sum = SharedA + SharedB;
189d86ed7fbStbbdev         SharedA = SharedB;
190d86ed7fbStbbdev         SharedB = sum;
191d86ed7fbStbbdev         ++SharedI;
192d86ed7fbStbbdev     }
193d86ed7fbStbbdev }
194d86ed7fbStbbdev 
195d86ed7fbStbbdev //! Root function
196d86ed7fbStbbdev template <class M>
SharedSerialFib(int n)197d86ed7fbStbbdev value SharedSerialFib(int n) {
198d86ed7fbStbbdev     SharedA = 0;
199d86ed7fbStbbdev     SharedB = 1;
200d86ed7fbStbbdev     SharedI = 1;
201d86ed7fbStbbdev     SharedN = n;
202d86ed7fbStbbdev     M mutex;
203d86ed7fbStbbdev     parallel_for(oneapi::tbb::blocked_range<int>(0, 4, 1), SharedSerialFibBody<M>(mutex));
204d86ed7fbStbbdev     return SharedB;
205d86ed7fbStbbdev }
206d86ed7fbStbbdev 
207d86ed7fbStbbdev // *** Serial shared by concurrent hash map *** //
208d86ed7fbStbbdev 
209d86ed7fbStbbdev //! Hash comparer
210d86ed7fbStbbdev struct IntHashCompare {
equalIntHashCompare211d86ed7fbStbbdev     bool equal(const int j, const int k) const {
212d86ed7fbStbbdev         return j == k;
213d86ed7fbStbbdev     }
hashIntHashCompare2144a23d002Skboyarinov     std::size_t hash(const int k) const {
2154a23d002Skboyarinov         return (std::size_t)k;
216d86ed7fbStbbdev     }
217d86ed7fbStbbdev };
218d86ed7fbStbbdev //! NumbersTable type based on concurrent_hash_map
219d86ed7fbStbbdev typedef oneapi::tbb::concurrent_hash_map<int, value, IntHashCompare> NumbersTable;
220d86ed7fbStbbdev //! task for serial method using shared concurrent_hash_map
221d86ed7fbStbbdev class ConcurrentHashSerialFibTask {
222d86ed7fbStbbdev     NumbersTable &Fib;
223d86ed7fbStbbdev     int my_n;
224d86ed7fbStbbdev 
225d86ed7fbStbbdev public:
226d86ed7fbStbbdev     //! constructor
ConcurrentHashSerialFibTask(NumbersTable & cht,int n)227d86ed7fbStbbdev     ConcurrentHashSerialFibTask(NumbersTable &cht, int n) : Fib(cht), my_n(n) {}
228d86ed7fbStbbdev     //! executing task
operator ()() const229d86ed7fbStbbdev     void operator()() const {
230d86ed7fbStbbdev         for (int i = 2; i <= my_n; ++i) { // there is no difference in to recycle or to make loop
231d86ed7fbStbbdev             NumbersTable::const_accessor f1, f2; // same as iterators
232d86ed7fbStbbdev             if (!Fib.find(f1, i - 1) || !Fib.find(f2, i - 2)) {
233d86ed7fbStbbdev                 // Something is seriously wrong, because i-1 and i-2 must have been inserted
234d86ed7fbStbbdev                 // earlier by this thread or another thread.
235d86ed7fbStbbdev                 assert(0);
236d86ed7fbStbbdev             }
237d86ed7fbStbbdev             value sum = f1->second + f2->second;
238d86ed7fbStbbdev             NumbersTable::const_accessor fsum;
239d86ed7fbStbbdev             Fib.insert(fsum, std::make_pair(i, sum)); // inserting
240d86ed7fbStbbdev             assert(fsum->second == sum); // check value
241d86ed7fbStbbdev         }
242d86ed7fbStbbdev     }
243d86ed7fbStbbdev };
244d86ed7fbStbbdev 
245d86ed7fbStbbdev //! Root function
ConcurrentHashSerialFib(int n)246d86ed7fbStbbdev value ConcurrentHashSerialFib(int n) {
247d86ed7fbStbbdev     NumbersTable Fib;
248d86ed7fbStbbdev     bool okay;
249d86ed7fbStbbdev     okay = Fib.insert(std::make_pair(0, 0));
250d86ed7fbStbbdev     assert(okay); // assign initial values
251d86ed7fbStbbdev     okay = Fib.insert(std::make_pair(1, 1));
252d86ed7fbStbbdev     assert(okay);
253d86ed7fbStbbdev 
254d86ed7fbStbbdev     // task_list list;
255d86ed7fbStbbdev     oneapi::tbb::task_group tg;
256d86ed7fbStbbdev     // allocate tasks
257d86ed7fbStbbdev     tg.run(ConcurrentHashSerialFibTask(Fib, n));
258d86ed7fbStbbdev     tg.run(ConcurrentHashSerialFibTask(Fib, n));
259d86ed7fbStbbdev     tg.wait();
260d86ed7fbStbbdev     NumbersTable::const_accessor fresult;
261d86ed7fbStbbdev     okay = Fib.find(fresult, n);
262d86ed7fbStbbdev     assert(okay);
263d86ed7fbStbbdev     return fresult->second;
264d86ed7fbStbbdev }
265d86ed7fbStbbdev 
266d86ed7fbStbbdev // *** Queue with parallel_pipeline *** //
267d86ed7fbStbbdev 
268d86ed7fbStbbdev typedef oneapi::tbb::concurrent_queue<Matrix2x2> queue_t;
269d86ed7fbStbbdev namespace parallel_pipeline_ns {
270d86ed7fbStbbdev std::atomic<int> N; //< index of Fibonacci number minus 1
271d86ed7fbStbbdev queue_t Queue;
272d86ed7fbStbbdev } // namespace parallel_pipeline_ns
273d86ed7fbStbbdev 
274d86ed7fbStbbdev //! functor to fills queue
275d86ed7fbStbbdev struct InputFunc {
InputFuncInputFunc276d86ed7fbStbbdev     InputFunc() {}
operator ()InputFunc277d86ed7fbStbbdev     queue_t *operator()(oneapi::tbb::flow_control &fc) const {
278d86ed7fbStbbdev         using namespace parallel_pipeline_ns;
279d86ed7fbStbbdev 
280d86ed7fbStbbdev         int n = --N;
281d86ed7fbStbbdev         if (n <= 0) {
282d86ed7fbStbbdev             fc.stop();
283d86ed7fbStbbdev             return nullptr;
284d86ed7fbStbbdev         }
285d86ed7fbStbbdev         Queue.push(Matrix1110);
286d86ed7fbStbbdev         return &Queue;
287d86ed7fbStbbdev     }
288d86ed7fbStbbdev };
289d86ed7fbStbbdev //! functor to process queue
290d86ed7fbStbbdev struct MultiplyFunc {
MultiplyFuncMultiplyFunc291d86ed7fbStbbdev     MultiplyFunc() {}
operator ()MultiplyFunc292d86ed7fbStbbdev     void operator()(queue_t *queue) const {
293d86ed7fbStbbdev         //concurrent_queue<Matrix2x2> &Queue = *static_cast<concurrent_queue<Matrix2x2> *>(p);
294d86ed7fbStbbdev         Matrix2x2 m1, m2;
295d86ed7fbStbbdev         // get two elements
296d86ed7fbStbbdev         while (!queue->try_pop(m1))
297b15aabb3Stbbdev             yield();
298d86ed7fbStbbdev         while (!queue->try_pop(m2))
299b15aabb3Stbbdev             yield();
300d86ed7fbStbbdev         m1 = m1 * m2; // process them
301d86ed7fbStbbdev         queue->push(m1); // and push back
302d86ed7fbStbbdev     }
303d86ed7fbStbbdev };
304d86ed7fbStbbdev //! Root function
ParallelPipeFib(int n)305d86ed7fbStbbdev value ParallelPipeFib(int n) {
306d86ed7fbStbbdev     using namespace parallel_pipeline_ns;
307d86ed7fbStbbdev 
308d86ed7fbStbbdev     N = n - 1;
309d86ed7fbStbbdev     Queue.push(Matrix1110);
310d86ed7fbStbbdev 
311d86ed7fbStbbdev     oneapi::tbb::parallel_pipeline(
312d86ed7fbStbbdev         n,
313d86ed7fbStbbdev         oneapi::tbb::make_filter<void, queue_t *>(oneapi::tbb::filter_mode::parallel, InputFunc()) &
314d86ed7fbStbbdev             oneapi::tbb::make_filter<queue_t *, void>(oneapi::tbb::filter_mode::parallel,
315d86ed7fbStbbdev                                                       MultiplyFunc()));
316d86ed7fbStbbdev 
317d86ed7fbStbbdev     assert(Queue.unsafe_size() == 1);
318d86ed7fbStbbdev     Matrix2x2 M;
319d86ed7fbStbbdev     bool result = Queue.try_pop(M); // get last element
320d86ed7fbStbbdev     assert(result);
321d86ed7fbStbbdev     value res = M.v[0][0]; // get value
322d86ed7fbStbbdev     Queue.clear();
323d86ed7fbStbbdev     return res;
324d86ed7fbStbbdev }
325d86ed7fbStbbdev 
326d86ed7fbStbbdev // *** parallel_reduce *** //
327d86ed7fbStbbdev 
328d86ed7fbStbbdev //! Functor for parallel_reduce
329d86ed7fbStbbdev struct parallel_reduceFibBody {
330d86ed7fbStbbdev     Matrix2x2 sum;
331d86ed7fbStbbdev     int split_flag; //< flag to make one less operation for split bodies
332d86ed7fbStbbdev     //! Constructor fills sum with initial matrix
parallel_reduceFibBodyparallel_reduceFibBody333d86ed7fbStbbdev     parallel_reduceFibBody() : sum(Matrix1110), split_flag(0) {}
334d86ed7fbStbbdev     //! Splitting constructor
parallel_reduceFibBodyparallel_reduceFibBody335d86ed7fbStbbdev     parallel_reduceFibBody(parallel_reduceFibBody &other, oneapi::tbb::split)
336d86ed7fbStbbdev             : sum(Matrix1110),
337d86ed7fbStbbdev               split_flag(1 /*note that it is split*/) {}
338d86ed7fbStbbdev     //! Join point
joinparallel_reduceFibBody339d86ed7fbStbbdev     void join(parallel_reduceFibBody &s) {
340d86ed7fbStbbdev         sum = sum * s.sum;
341d86ed7fbStbbdev     }
342d86ed7fbStbbdev     //! Process multiplications
operator ()parallel_reduceFibBody343d86ed7fbStbbdev     void operator()(const oneapi::tbb::blocked_range<int> &r) {
344d86ed7fbStbbdev         for (int k = r.begin() + split_flag; k < r.end(); ++k)
345d86ed7fbStbbdev             sum = sum * Matrix1110;
346d86ed7fbStbbdev         split_flag = 0; // reset flag, because this method can be reused for next range
347d86ed7fbStbbdev     }
348d86ed7fbStbbdev };
349d86ed7fbStbbdev //! Root function
parallel_reduceFib(int n)350d86ed7fbStbbdev value parallel_reduceFib(int n) {
351d86ed7fbStbbdev     parallel_reduceFibBody b;
352d86ed7fbStbbdev     oneapi::tbb::parallel_reduce(oneapi::tbb::blocked_range<int>(2, n, 3),
353d86ed7fbStbbdev                                  b); // do parallel reduce on range [2, n) for b
354d86ed7fbStbbdev     return b.sum.v[0][0];
355d86ed7fbStbbdev }
356d86ed7fbStbbdev 
357d86ed7fbStbbdev // *** parallel_scan *** //
358d86ed7fbStbbdev 
359d86ed7fbStbbdev //! Functor for parallel_scan
360d86ed7fbStbbdev struct parallel_scanFibBody {
361d86ed7fbStbbdev     /** Though parallel_scan is usually used to accumulate running sums,
362d86ed7fbStbbdev         it can be used to accumulate running products too. */
363d86ed7fbStbbdev     Matrix2x2 product;
364d86ed7fbStbbdev     /** Pointer to output sequence */
365d86ed7fbStbbdev     value *const output;
366d86ed7fbStbbdev     //! Constructor sets product to identity matrix
parallel_scanFibBodyparallel_scanFibBody367d86ed7fbStbbdev     parallel_scanFibBody(value *output_) : product(MatrixIdentity), output(output_) {}
368d86ed7fbStbbdev     //! Splitting constructor
parallel_scanFibBodyparallel_scanFibBody369d86ed7fbStbbdev     parallel_scanFibBody(parallel_scanFibBody &b, oneapi::tbb::split)
370d86ed7fbStbbdev             : product(MatrixIdentity),
371d86ed7fbStbbdev               output(b.output) {}
372d86ed7fbStbbdev     //! Method for merging summary information from a, which was split off from *this, into *this.
reverse_joinparallel_scanFibBody373d86ed7fbStbbdev     void reverse_join(parallel_scanFibBody &a) {
374d86ed7fbStbbdev         // When using non-commutative reduction operation, reverse_join
375d86ed7fbStbbdev         // should put argument "a" on the left side of the operation.
376d86ed7fbStbbdev         // The reversal from the argument order is why the method is
377d86ed7fbStbbdev         // called "reverse_join" instead of "join".
378d86ed7fbStbbdev         product = a.product * product;
379d86ed7fbStbbdev     }
380d86ed7fbStbbdev     //! Method for assigning final result back to original body.
assignparallel_scanFibBody381d86ed7fbStbbdev     void assign(parallel_scanFibBody &b) {
382d86ed7fbStbbdev         product = b.product;
383d86ed7fbStbbdev     }
384d86ed7fbStbbdev     //! Compute matrix running product.
385d86ed7fbStbbdev     /** Tag indicates whether is is the final scan over the range, or
386d86ed7fbStbbdev         just a helper "prescan" that is computing a partial reduction. */
387d86ed7fbStbbdev     template <typename Tag>
operator ()parallel_scanFibBody388d86ed7fbStbbdev     void operator()(const oneapi::tbb::blocked_range<int> &r, Tag tag) {
389d86ed7fbStbbdev         for (int k = r.begin(); k < r.end(); ++k) {
390d86ed7fbStbbdev             // Code performs an "exclusive" scan, which outputs a value *before* updating the product.
391d86ed7fbStbbdev             // For an "inclusive" scan, output the value after the update.
392d86ed7fbStbbdev             if (tag.is_final_scan())
393d86ed7fbStbbdev                 output[k] = product.v[0][1];
394d86ed7fbStbbdev             product = product * Matrix1110;
395d86ed7fbStbbdev         }
396d86ed7fbStbbdev     }
397d86ed7fbStbbdev };
398d86ed7fbStbbdev //! Root function
parallel_scanFib(int n)399d86ed7fbStbbdev value parallel_scanFib(int n) {
400d86ed7fbStbbdev     value *output = new value[n];
401d86ed7fbStbbdev     parallel_scanFibBody b(output);
402d86ed7fbStbbdev     oneapi::tbb::parallel_scan(oneapi::tbb::blocked_range<int>(0, n, 3), b);
403d86ed7fbStbbdev     // output[0..n-1] now contains the Fibonacci sequence (modulo integer wrap-around).
404d86ed7fbStbbdev     // Check the last two values for correctness.
405d86ed7fbStbbdev     assert(n < 2 || output[n - 2] + output[n - 1] == b.product.v[0][1]);
406d86ed7fbStbbdev     delete[] output;
407d86ed7fbStbbdev     return b.product.v[0][1];
408d86ed7fbStbbdev }
409d86ed7fbStbbdev 
410d86ed7fbStbbdev /////////////////////////// Main ////////////////////////////////////////////////////
411d86ed7fbStbbdev 
412d86ed7fbStbbdev //! A closed range of int.
413d86ed7fbStbbdev struct IntRange {
414d86ed7fbStbbdev     int low;
415d86ed7fbStbbdev     int high;
416d86ed7fbStbbdev     void set_from_string(const char *s);
IntRangeIntRange417d86ed7fbStbbdev     IntRange(int low_, int high_) : low(low_), high(high_) {}
418d86ed7fbStbbdev };
419d86ed7fbStbbdev 
set_from_string(const char * s)420d86ed7fbStbbdev void IntRange::set_from_string(const char *s) {
421d86ed7fbStbbdev     char *end;
422d86ed7fbStbbdev     high = low = strtol(s, &end, 0);
423d86ed7fbStbbdev     switch (*end) {
42457f524caSIlya Isaev         case ':': high = strtol(end + 1, nullptr, 0); break;
425d86ed7fbStbbdev         case '\0': break;
426d86ed7fbStbbdev         default: printf("unexpected character = %c\n", *end);
427d86ed7fbStbbdev     }
428d86ed7fbStbbdev }
429d86ed7fbStbbdev 
430d86ed7fbStbbdev //! Tick count for start
431d86ed7fbStbbdev static oneapi::tbb::tick_count t0;
432d86ed7fbStbbdev 
433d86ed7fbStbbdev //! Verbose output flag
434d86ed7fbStbbdev static bool Verbose = false;
435d86ed7fbStbbdev 
436d86ed7fbStbbdev typedef value (*MeasureFunc)(int);
437d86ed7fbStbbdev //! Measure ticks count in loop [2..n]
Measure(const char * name,MeasureFunc func,int n)438d86ed7fbStbbdev value Measure(const char *name, MeasureFunc func, int n) {
439d86ed7fbStbbdev     value result;
440d86ed7fbStbbdev     if (Verbose)
441d86ed7fbStbbdev         printf("%s", name);
442d86ed7fbStbbdev     t0 = oneapi::tbb::tick_count::now();
443d86ed7fbStbbdev     for (int number = 2; number <= n; number++)
444d86ed7fbStbbdev         result = func(number);
445d86ed7fbStbbdev     if (Verbose)
446d86ed7fbStbbdev         printf("\t- in %f msec\n", (oneapi::tbb::tick_count::now() - t0).seconds() * 1000);
447d86ed7fbStbbdev     return result;
448d86ed7fbStbbdev }
449d86ed7fbStbbdev 
450d86ed7fbStbbdev //! program entry
main(int argc,char * argv[])451d86ed7fbStbbdev int main(int argc, char *argv[]) {
452d86ed7fbStbbdev     if (argc > 1)
453d86ed7fbStbbdev         Verbose = true;
45457f524caSIlya Isaev     int NumbersCount = argc > 1 ? strtol(argv[1], nullptr, 0) : 500;
455d86ed7fbStbbdev     IntRange NThread(1, 4); // Number of threads to use.
456d86ed7fbStbbdev     if (argc > 2)
457d86ed7fbStbbdev         NThread.set_from_string(argv[2]);
45857f524caSIlya Isaev     unsigned long ntrial = argc > 3 ? (unsigned long)strtoul(argv[3], nullptr, 0) : 1;
459d86ed7fbStbbdev     value result, sum;
460d86ed7fbStbbdev 
461d86ed7fbStbbdev     if (Verbose)
462d86ed7fbStbbdev         printf("Fibonacci numbers example. Generating %d numbers..\n", NumbersCount);
463d86ed7fbStbbdev 
464d86ed7fbStbbdev     result = Measure("Serial loop", SerialFib, NumbersCount);
465d86ed7fbStbbdev     sum = Measure("Serial matrix", SerialMatrixFib, NumbersCount);
466d86ed7fbStbbdev     assert(result == sum);
467d86ed7fbStbbdev     sum = Measure("Serial vector", SerialVectorFib, NumbersCount);
468d86ed7fbStbbdev     assert(result == sum);
469d86ed7fbStbbdev     sum = Measure("Serial queue", SerialQueueFib, NumbersCount);
470d86ed7fbStbbdev     assert(result == sum);
471d86ed7fbStbbdev     // now in parallel
472d86ed7fbStbbdev     for (unsigned long i = 0; i < ntrial; ++i) {
473d86ed7fbStbbdev         for (int threads = NThread.low; threads <= NThread.high; threads *= 2) {
474d86ed7fbStbbdev             oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism,
475d86ed7fbStbbdev                                           threads);
476d86ed7fbStbbdev             if (Verbose)
477d86ed7fbStbbdev                 printf("\nThreads number is %d\n", threads);
478d86ed7fbStbbdev 
479d86ed7fbStbbdev             sum = Measure("Shared serial (mutex)\t", SharedSerialFib<std::mutex>, NumbersCount);
480d86ed7fbStbbdev             assert(result == sum);
481d86ed7fbStbbdev             sum = Measure("Shared serial (spin_mutex)",
482d86ed7fbStbbdev                           SharedSerialFib<oneapi::tbb::spin_mutex>,
483d86ed7fbStbbdev                           NumbersCount);
484d86ed7fbStbbdev             assert(result == sum);
485d86ed7fbStbbdev             sum = Measure("Shared serial (queuing_mutex)",
486d86ed7fbStbbdev                           SharedSerialFib<oneapi::tbb::queuing_mutex>,
487d86ed7fbStbbdev                           NumbersCount);
488d86ed7fbStbbdev             assert(result == sum);
489d86ed7fbStbbdev             sum = Measure("Shared serial (Conc.HashTable)", ConcurrentHashSerialFib, NumbersCount);
490d86ed7fbStbbdev             assert(result == sum);
491d86ed7fbStbbdev             sum = Measure("Parallel pipe/queue\t", ParallelPipeFib, NumbersCount);
492d86ed7fbStbbdev             assert(result == sum);
493d86ed7fbStbbdev             sum = Measure("Parallel reduce\t\t", parallel_reduceFib, NumbersCount);
494d86ed7fbStbbdev             assert(result == sum);
495d86ed7fbStbbdev             sum = Measure("Parallel scan\t\t", parallel_scanFib, NumbersCount);
496d86ed7fbStbbdev             assert(result == sum);
497d86ed7fbStbbdev         }
498d86ed7fbStbbdev 
499d86ed7fbStbbdev #ifdef __GNUC__
500d86ed7fbStbbdev         if (Verbose)
501d86ed7fbStbbdev             printf("Fibonacci number #%d modulo 2^64 is %lld\n\n", NumbersCount, result);
502d86ed7fbStbbdev #else
503d86ed7fbStbbdev         if (Verbose)
504d86ed7fbStbbdev             printf("Fibonacci number #%d modulo 2^64 is %I64d\n\n", NumbersCount, result);
505d86ed7fbStbbdev #endif
506d86ed7fbStbbdev     }
507d86ed7fbStbbdev     if (!Verbose)
508d86ed7fbStbbdev         printf("TEST PASSED\n");
5094523a761Stbbdev     // flush to prevent bufferization on exit
5104523a761Stbbdev     fflush(stdout);
511d86ed7fbStbbdev     return 0;
512d86ed7fbStbbdev }
513d86ed7fbStbbdev 
514d86ed7fbStbbdev // Utils
515d86ed7fbStbbdev 
Matrix2x2Multiply(const value a[2][2],const value b[2][2],value c[2][2])516d86ed7fbStbbdev void Matrix2x2Multiply(const value a[2][2], const value b[2][2], value c[2][2]) {
517d86ed7fbStbbdev     for (int i = 0; i <= 1; i++)
518d86ed7fbStbbdev         for (int j = 0; j <= 1; j++)
519d86ed7fbStbbdev             c[i][j] = a[i][0] * b[0][j] + a[i][1] * b[1][j];
520d86ed7fbStbbdev }
521d86ed7fbStbbdev 
operator *(const Matrix2x2 & to) const522d86ed7fbStbbdev Matrix2x2 Matrix2x2::operator*(const Matrix2x2 &to) const {
523d86ed7fbStbbdev     Matrix2x2 result;
524d86ed7fbStbbdev     Matrix2x2Multiply(v, to.v, result.v);
525d86ed7fbStbbdev     return result;
526d86ed7fbStbbdev }
527