1 /*
2     Copyright (c) 2005-2023 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #define DOCTEST_CONFIG_SUPER_FAST_ASSERTS
18 #include "common/test.h"
19 #include "common/utils.h"
20 #include "common/utils_report.h"
21 #include "common/test_invoke.h"
22 
23 #include "oneapi/tbb/parallel_for.h"
24 #include "oneapi/tbb/tick_count.h"
25 
26 #include "../tbb/test_partitioner.h"
27 
28 #include <atomic>
29 
30 //! \file conformance_parallel_for.cpp
31 //! \brief Test for [algorithms.parallel_for algorithms.auto_partitioner algorithms.simple_partitioner algorithms.static_partitioner algorithms.affinity_partitioner] specification
32 
33 static const int N = 500;
34 static std::atomic<int> Array[N];
35 
36 struct parallel_tag {};
37 struct empty_partitioner_tag {};
38 
39 // Testing parallel_for with step support
40 const std::size_t PFOR_BUFFER_TEST_SIZE = 1024;
41 // test_buffer has some extra items beyond its right bound
42 const std::size_t PFOR_BUFFER_ACTUAL_SIZE = PFOR_BUFFER_TEST_SIZE + 1024;
43 size_t pfor_buffer[PFOR_BUFFER_ACTUAL_SIZE];
44 
45 template<typename T>
46 class TestFunctor{
47 public:
48     void operator ()(T index) const {
49         pfor_buffer[index]++;
50     }
51 };
52 
53 static std::atomic<int> FooBodyCount;
54 
55 // A range object whose only public members are those required by the Range concept.
56 template<size_t Pad>
57 class FooRange {
58     // Start of range
59     int start;
60 
61     // Size of range
62     int size;
63     FooRange( int start_, int size_ ) : start(start_), size(size_) {
64         utils::zero_fill<char>(pad, Pad);
65         pad[Pad-1] = 'x';
66     }
67     template<typename Flavor_, std::size_t Pad_> friend void Flog( );
68     template<size_t Pad_> friend class FooBody;
69     void operator&();
70 
71     char pad[Pad];
72 public:
73     bool empty() const {return size==0;}
74     bool is_divisible() const {return size>1;}
75     FooRange( FooRange& original, oneapi::tbb::split ) : size(original.size/2) {
76         original.size -= size;
77         start = original.start+original.size;
78         CHECK_FAST( original.pad[Pad-1]=='x');
79         pad[Pad-1] = 'x';
80     }
81 };
82 
83 // A range object whose only public members are those required by the parallel_for.h body concept.
84 template<size_t Pad>
85 class FooBody {
86 public:
87     ~FooBody() {
88         --FooBodyCount;
89         for( std::size_t i=0; i<sizeof(*this); ++i )
90             reinterpret_cast<char*>(this)[i] = -1;
91     }
92     // Copy constructor
93     FooBody( const FooBody& other ) : array(other.array), state(other.state) {
94         ++FooBodyCount;
95         CHECK_FAST(state == LIVE);
96     }
97     void operator()( FooRange<Pad>& r ) const {
98         for (int k = r.start; k < r.start + r.size; ++k) {
99             CHECK_FAST(array[k].load(std::memory_order_relaxed) == 0);
100             array[k].store(1, std::memory_order_relaxed);
101         }
102     }
103 private:
104     const int LIVE = 0x1234;
105     std::atomic<int>* array;
106     int state;
107     friend class FooRange<Pad>;
108     template<typename Flavor_, std::size_t Pad_> friend void Flog( );
109     FooBody( std::atomic<int>* array_ ) : array(array_), state(LIVE) {}
110 };
111 
112 template <typename Flavor, typename Partitioner, typename Range, typename Body>
113 struct Invoker;
114 
115 template <typename Range, typename Body>
116 struct Invoker<parallel_tag, empty_partitioner_tag, Range, Body> {
117     void operator()( const Range& r, const Body& body, empty_partitioner_tag& ) {
118         oneapi::tbb::parallel_for( r, body );
119     }
120 };
121 
122 template <typename Partitioner, typename Range, typename Body>
123 struct Invoker<parallel_tag, Partitioner, Range, Body> {
124     void operator()( const Range& r, const Body& body, Partitioner& p ) {
125         oneapi::tbb::parallel_for( r, body, p );
126     }
127 };
128 
129 template <typename Flavor, typename Partitioner, typename T, typename Body>
130 struct InvokerStep;
131 
132 template <typename T, typename Body>
133 struct InvokerStep<parallel_tag, empty_partitioner_tag, T, Body> {
134     void operator()( const T& first, const T& last, const Body& f, empty_partitioner_tag& ) {
135         oneapi::tbb::parallel_for( first, last, f );
136     }
137     void operator()( const T& first, const T& last, const T& step, const Body& f, empty_partitioner_tag& ) {
138         oneapi::tbb::parallel_for( first, last, step, f );
139     }
140 };
141 
142 template <typename Partitioner, typename T, typename Body>
143 struct InvokerStep<parallel_tag, Partitioner, T, Body> {
144     void operator()( const T& first, const T& last, const Body& f, Partitioner& p ) {
145         oneapi::tbb::parallel_for( first, last, f, p );
146     }
147     void operator()( const T& first, const T& last, const T& step, const Body& f, Partitioner& p ) {
148         oneapi::tbb::parallel_for( first, last, step, f, p );
149     }
150 };
151 
152 template<typename Flavor, std::size_t Pad>
153 void Flog() {
154     for ( int i=0; i<N; ++i ) {
155         for ( int mode = 0; mode < 4; ++mode) {
156             FooRange<Pad> r( 0, i );
157             const FooRange<Pad> rc = r;
158             FooBody<Pad> f( Array );
159             const FooBody<Pad> fc = f;
160             for (int a_i = 0; a_i < N; a_i++) {
161                 Array[a_i].store(0, std::memory_order_relaxed);
162             }
163             FooBodyCount = 1;
164             switch (mode) {
165             case 0: {
166                 empty_partitioner_tag p;
167                 Invoker< Flavor, empty_partitioner_tag, FooRange<Pad>, FooBody<Pad> > invoke_for;
168                 invoke_for( rc, fc, p );
169             }
170                 break;
171             case 1: {
172                 Invoker< Flavor, const oneapi::tbb::simple_partitioner, FooRange<Pad>, FooBody<Pad> > invoke_for;
173                 invoke_for( rc, fc, oneapi::tbb::simple_partitioner() );
174             }
175                 break;
176             case 2: {
177                 Invoker< Flavor, const oneapi::tbb::auto_partitioner, FooRange<Pad>, FooBody<Pad> > invoke_for;
178                 invoke_for( rc, fc, oneapi::tbb::auto_partitioner() );
179             }
180                 break;
181             case 3: {
182                 static oneapi::tbb::affinity_partitioner affinity;
183                 Invoker< Flavor, oneapi::tbb::affinity_partitioner, FooRange<Pad>, FooBody<Pad> > invoke_for;
184                 invoke_for( rc, fc, affinity );
185             }
186                 break;
187             }
188             CHECK(std::find_if_not(Array, Array + i, [](const std::atomic<int>& v) { return v.load(std::memory_order_relaxed) == 1; }) == Array + i);
189             CHECK(std::find_if_not(Array + i, Array + N, [](const std::atomic<int>& v) { return v.load(std::memory_order_relaxed) == 0; }) == Array + N);
190             CHECK(FooBodyCount == 1);
191         }
192     }
193 }
194 
195 #include <stdexcept> // std::invalid_argument
196 
197 template <typename Flavor, typename T, typename Partitioner>
198 void TestParallelForWithStepSupportHelper(Partitioner& p) {
199     const T pfor_buffer_test_size = static_cast<T>(PFOR_BUFFER_TEST_SIZE);
200     const T pfor_buffer_actual_size = static_cast<T>(PFOR_BUFFER_ACTUAL_SIZE);
201     // Testing parallel_for with different step values
202     InvokerStep< Flavor, Partitioner, T, TestFunctor<T> > invoke_for;
203     for (T begin = 0; begin < pfor_buffer_test_size - 1; begin += pfor_buffer_test_size / 10 + 1) {
204         T step;
205         for (step = 1; step < pfor_buffer_test_size; step++) {
206             std::memset(pfor_buffer, 0, pfor_buffer_actual_size * sizeof(std::size_t));
207             if (step == 1){
208                 invoke_for(begin, pfor_buffer_test_size, TestFunctor<T>(), p);
209             } else {
210                 invoke_for(begin, pfor_buffer_test_size, step, TestFunctor<T>(), p);
211             }
212             // Verifying that parallel_for processed all items it should
213             for (T i = begin; i < pfor_buffer_test_size; i = i + step) {
214                 if (pfor_buffer[i] != 1) {
215                     CHECK_MESSAGE(false, "parallel_for didn't process all required elements");
216                 }
217                 pfor_buffer[i] = 0;
218             }
219             // Verifying that no extra items were processed and right bound of array wasn't crossed
220             for (T i = 0; i < pfor_buffer_actual_size; i++) {
221                 if (pfor_buffer[i] != 0) {
222                     CHECK_MESSAGE(false, "parallel_for processed an extra element");
223                 }
224             }
225         }
226     }
227 }
228 
229 template <typename Flavor, typename T>
230 void TestParallelForWithStepSupport() {
231     static oneapi::tbb::affinity_partitioner affinity_p;
232     oneapi::tbb::auto_partitioner auto_p;
233     oneapi::tbb::simple_partitioner simple_p;
234     oneapi::tbb::static_partitioner static_p;
235     empty_partitioner_tag p;
236 
237     // Try out all partitioner combinations
238     TestParallelForWithStepSupportHelper< Flavor,T,empty_partitioner_tag >(p);
239     TestParallelForWithStepSupportHelper< Flavor,T,const oneapi::tbb::auto_partitioner >(auto_p);
240     TestParallelForWithStepSupportHelper< Flavor,T,const oneapi::tbb::simple_partitioner >(simple_p);
241     TestParallelForWithStepSupportHelper< Flavor,T,oneapi::tbb::affinity_partitioner >(affinity_p);
242     TestParallelForWithStepSupportHelper< Flavor,T,oneapi::tbb::static_partitioner >(static_p);
243 
244     // Testing some corner cases
245     oneapi::tbb::parallel_for(static_cast<T>(2), static_cast<T>(1), static_cast<T>(1), TestFunctor<T>());
246 }
247 
248 #if __TBB_CPP17_INVOKE_PRESENT
249 class SmartIndex {
250 public:
251     SmartIndex(std::size_t ri) : real_index(ri), change_vector(nullptr) {}
252     SmartIndex(std::size_t ri, std::vector<std::size_t>& cv)
253         : real_index(ri), change_vector(&cv) {}
254     SmartIndex(const SmartIndex& other) : real_index(other.real_index),
255                                           change_vector(other.change_vector) {}
256     ~SmartIndex() = default;
257 
258     SmartIndex& operator=(const SmartIndex& other) {
259         real_index = other.real_index;
260         change_vector = other.change_vector;
261         return *this;
262     }
263 
264     bool operator<(const SmartIndex& other) const {
265         return real_index < other.real_index;
266     }
267 
268     bool operator<=(const SmartIndex& other) const {
269         return real_index <= other.real_index;
270     }
271 
272     SmartIndex operator/(const SmartIndex& other) const {
273         return {real_index / other.real_index, *change_vector};
274     }
275 
276     SmartIndex operator*(const SmartIndex& other) const {
277         return {real_index * other.real_index, *change_vector};
278     }
279 
280     SmartIndex operator+(const SmartIndex& other) const {
281         return {real_index + other.real_index, *change_vector};
282     }
283 
284     SmartIndex& operator+=(const SmartIndex& other) {
285         real_index += other.real_index;
286         return *this;
287     }
288 
289     SmartIndex& operator++() { ++real_index; return *this; }
290 
291     std::size_t operator-(const SmartIndex& other) const {
292         return real_index - other.real_index;
293     }
294 
295     SmartIndex operator+(std::size_t k) {
296         return {real_index + k, *change_vector};
297     }
298 
299     void increase() const {
300         CHECK(change_vector);
301         ++(*change_vector)[real_index];
302     }
303 private:
304     std::size_t real_index;
305     std::vector<std::size_t>* change_vector;
306 };
307 
308 void test_pfor_body_invoke() {
309     const std::size_t number_of_overloads = 5;
310     const std::size_t iterations = 100000;
311 
312     using range_type = test_invoke::SmartRange<std::size_t>;
313     std::vector<std::size_t> change_vector(iterations, 0);
314     range_type range{0, iterations, change_vector};
315 
316     oneapi::tbb::parallel_for(range, &range_type::increase);
317     oneapi::tbb::parallel_for(range, &range_type::increase, oneapi::tbb::simple_partitioner());
318     oneapi::tbb::parallel_for(range, &range_type::increase, oneapi::tbb::auto_partitioner());
319     oneapi::tbb::parallel_for(range, &range_type::increase, oneapi::tbb::static_partitioner());
320     oneapi::tbb::affinity_partitioner aff;
321     oneapi::tbb::parallel_for(range, &range_type::increase, aff);
322 
323     for (std::size_t item : change_vector) {
324         CHECK(item == number_of_overloads);
325     }
326 }
327 
328 
329 void test_pfor_func_invoke() {
330     const std::size_t number_of_overloads = 5;
331     const std::size_t iterations = 100000;
332 
333     std::vector<std::size_t> change_vector(iterations, 0);
334     SmartIndex first{0, change_vector};
335     SmartIndex last{iterations, change_vector};
336     SmartIndex stride{2};
337 
338     oneapi::tbb::parallel_for(first, last, &SmartIndex::increase);
339     oneapi::tbb::parallel_for(first, last, &SmartIndex::increase, oneapi::tbb::simple_partitioner());
340     oneapi::tbb::parallel_for(first, last, &SmartIndex::increase, oneapi::tbb::auto_partitioner());
341     oneapi::tbb::parallel_for(first, last, &SmartIndex::increase, oneapi::tbb::static_partitioner());
342     oneapi::tbb::affinity_partitioner aff;
343     oneapi::tbb::parallel_for(first, last, &SmartIndex::increase, aff);
344 
345     for (std::size_t& item : change_vector) {
346         CHECK(item == number_of_overloads);
347         item = 0;
348     }
349 
350     oneapi::tbb::parallel_for(first, last, stride, &SmartIndex::increase);
351     oneapi::tbb::parallel_for(first, last, stride, &SmartIndex::increase, oneapi::tbb::simple_partitioner());
352     oneapi::tbb::parallel_for(first, last, stride, &SmartIndex::increase, oneapi::tbb::auto_partitioner());
353     oneapi::tbb::parallel_for(first, last, stride, &SmartIndex::increase, oneapi::tbb::static_partitioner());
354     oneapi::tbb::parallel_for(first, last, stride, &SmartIndex::increase, aff);
355 
356     CHECK(change_vector[0] == number_of_overloads);
357     for (std::size_t i = 1; i < iterations; ++i) {
358         std::size_t expected = change_vector[i - 1] == 0 ? number_of_overloads : 0;
359         CHECK(change_vector[i] == expected);
360     }
361 }
362 #endif // __TBB_CPP17_INVOKE_PRESENT
363 
364 //! Test simple parallel_for with different partitioners
365 //! \brief \ref interface \ref requirement
366 TEST_CASE("Basic parallel_for") {
367     std::atomic<unsigned long> counter{};
368     const std::size_t number_of_partitioners = 5;
369     const std::size_t iterations = 100000;
370 
371     oneapi::tbb::parallel_for(std::size_t(0), iterations, [&](std::size_t) {
372         counter++;
373     });
374 
375     oneapi::tbb::parallel_for(std::size_t(0), iterations, [&](std::size_t) {
376         counter++;
377     }, oneapi::tbb::simple_partitioner());
378 
379     oneapi::tbb::parallel_for(std::size_t(0), iterations, [&](std::size_t) {
380         counter++;
381     }, oneapi::tbb::auto_partitioner());
382 
383     oneapi::tbb::parallel_for(std::size_t(0), iterations, [&](std::size_t) {
384         counter++;
385     }, oneapi::tbb::static_partitioner());
386 
387     oneapi::tbb::affinity_partitioner aff;
388     oneapi::tbb::parallel_for(std::size_t(0), iterations, [&](std::size_t) {
389         counter++;
390     }, aff);
391 
392     CHECK_EQ(counter.load(std::memory_order_relaxed), iterations * number_of_partitioners);
393 }
394 
395 //! Testing parallel for with different partitioners and ranges ranges
396 //! \brief \ref interface \ref requirement \ref stress
397 TEST_CASE("Flog test") {
398     Flog<parallel_tag, 1>();
399     Flog<parallel_tag, 10>();
400     Flog<parallel_tag, 100>();
401     Flog<parallel_tag, 1000>();
402     Flog<parallel_tag, 10000>();
403 }
404 
405 //! Testing parallel for with different types and step
406 //! \brief \ref interface \ref requirement
407 TEST_CASE_TEMPLATE("parallel_for with step support", T, short, unsigned short, int, unsigned int,
408                                     long, unsigned long, long long, unsigned long long, std::size_t) {
409     // Testing with different integer types
410     TestParallelForWithStepSupport<parallel_tag, T>();
411 }
412 
413 //! Testing with different types of ranges and partitioners
414 //! \brief \ref interface \ref requirement
415 TEST_CASE("Testing parallel_for with partitioners") {
416     using namespace test_partitioner_utils::interaction_with_range_and_partitioner;
417 
418     test_partitioner_utils::SimpleBody b;
419     oneapi::tbb::affinity_partitioner ap;
420 
421     parallel_for(Range1(true, false), b, ap);
422     parallel_for(Range6(false, true), b, ap);
423 
424     parallel_for(Range1(false, true), b, oneapi::tbb::simple_partitioner());
425     parallel_for(Range6(false, true), b, oneapi::tbb::simple_partitioner());
426 
427     parallel_for(Range1(false, true), b, oneapi::tbb::auto_partitioner());
428     parallel_for(Range6(false, true), b, oneapi::tbb::auto_partitioner());
429 
430     parallel_for(Range1(true, false), b, oneapi::tbb::static_partitioner());
431     parallel_for(Range6(false, true), b, oneapi::tbb::static_partitioner());
432 }
433 
434 #if __TBB_CPP17_INVOKE_PRESENT
435 //! Test that parallel_for uses std::invoke to run body and function
436 //! \brief \ref interface \ref requirement
437 TEST_CASE("parallel_for and std::invoke") {
438     test_pfor_body_invoke();
439     test_pfor_func_invoke();
440 }
441 #endif
442