xref: /oneTBB/include/oneapi/tbb/parallel_for.h (revision 0a2b3987)
1 /*
2     Copyright (c) 2005-2021 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #ifndef __TBB_parallel_for_H
18 #define __TBB_parallel_for_H
19 
20 #include "detail/_config.h"
21 #include "detail/_namespace_injection.h"
22 #include "detail/_exception.h"
23 #include "detail/_task.h"
24 #include "detail/_small_object_pool.h"
25 #include "profiling.h"
26 
27 #include "partitioner.h"
28 #include "blocked_range.h"
29 #include "task_group.h"
30 
31 #include <cstddef>
32 #include <new>
33 
34 namespace tbb {
35 namespace detail {
36 #if __TBB_CPP20_CONCEPTS_PRESENT
37 inline namespace d0 {
38 
39 template <typename Body, typename Range>
40 concept parallel_for_body = std::copy_constructible<Body> &&
41                             requires( const std::remove_reference_t<Body>& body, Range& range ) {
42                                 body(range);
43                             };
44 
45 template <typename Index>
46 concept parallel_for_index = std::constructible_from<Index, int> &&
47                              std::copyable<Index> &&
48                              requires( const std::remove_reference_t<Index>& lhs, const std::remove_reference_t<Index>& rhs ) {
49                                  { lhs < rhs } -> adaptive_same_as<bool>;
50                                  { lhs - rhs } -> std::convertible_to<std::size_t>;
51                                  { lhs + (rhs - lhs) } -> std::convertible_to<Index>;
52                              };
53 
54 template <typename Function, typename Index>
55 concept parallel_for_function = requires( const std::remove_reference_t<Function>& func, Index index ) {
56     func(index);
57 };
58 
59 } // namespace d0
60 #endif // __TBB_CPP20_CONCEPTS_PRESENT
61 namespace d1 {
62 
63 //! Task type used in parallel_for
64 /** @ingroup algorithms */
65 template<typename Range, typename Body, typename Partitioner>
66 struct start_for : public task {
67     Range my_range;
68     const Body my_body;
69     node* my_parent;
70 
71     typename Partitioner::task_partition_type my_partition;
72     small_object_allocator my_allocator;
73 
74     task* execute(execution_data&) override;
75     task* cancel(execution_data&) override;
76     void finalize(const execution_data&);
77 
78     //! Constructor for root task.
79     start_for( const Range& range, const Body& body, Partitioner& partitioner, small_object_allocator& alloc ) :
80         my_range(range),
81         my_body(body),
82         my_partition(partitioner),
83         my_allocator(alloc) {}
84     //! Splitting constructor used to generate children.
85     /** parent_ becomes left child.  Newly constructed object is right child. */
86     start_for( start_for& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) :
87         my_range(parent_.my_range, get_range_split_object<Range>(split_obj)),
88         my_body(parent_.my_body),
89         my_partition(parent_.my_partition, split_obj),
90         my_allocator(alloc) {}
91     //! Construct right child from the given range as response to the demand.
92     /** parent_ remains left child.  Newly constructed object is right child. */
93     start_for( start_for& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) :
94         my_range(r),
95         my_body(parent_.my_body),
96         my_partition(parent_.my_partition, split()),
97         my_allocator(alloc)
98     {
99         my_partition.align_depth( d );
100     }
101     static void run(const Range& range, const Body& body, Partitioner& partitioner) {
102         task_group_context context(PARALLEL_FOR);
103         run(range, body, partitioner, context);
104     }
105 
106     static void run(const Range& range, const Body& body, Partitioner& partitioner, task_group_context& context) {
107         if ( !range.empty() ) {
108             small_object_allocator alloc{};
109             start_for& for_task = *alloc.new_object<start_for>(range, body, partitioner, alloc);
110 
111             // defer creation of the wait node until task allocation succeeds
112             wait_node wn;
113             for_task.my_parent = &wn;
114             execute_and_wait(for_task, context, wn.m_wait, context);
115         }
116     }
117     //! Run body for range, serves as callback for partitioner
118     void run_body( Range &r ) {
119         my_body( r );
120     }
121 
122     //! spawn right task, serves as callback for partitioner
123     void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) {
124        offer_work_impl(ed, *this, split_obj);
125     }
126 
127     //! spawn right task, serves as callback for partitioner
128     void offer_work(const Range& r, depth_t d, execution_data& ed) {
129         offer_work_impl(ed, *this, r, d);
130     }
131 
132 private:
133     template <typename... Args>
134     void offer_work_impl(execution_data& ed, Args&&... constructor_args) {
135         // New right child
136         small_object_allocator alloc{};
137         start_for& right_child = *alloc.new_object<start_for>(ed, std::forward<Args>(constructor_args)..., alloc);
138 
139         // New root node as a continuation and ref count. Left and right child attach to the new parent.
140         right_child.my_parent = my_parent = alloc.new_object<tree_node>(ed, my_parent, 2, alloc);
141         // Spawn the right sibling
142         right_child.spawn_self(ed);
143     }
144 
145     void spawn_self(execution_data& ed) {
146         my_partition.spawn_task(*this, *context(ed));
147     }
148 };
149 
150 //! fold the tree and deallocate the task
151 template<typename Range, typename Body, typename Partitioner>
152 void start_for<Range, Body, Partitioner>::finalize(const execution_data& ed) {
153     // Get the current parent and allocator an object destruction
154     node* parent = my_parent;
155     auto allocator = my_allocator;
156     // Task execution finished - destroy it
157     this->~start_for();
158     // Unwind the tree decrementing the parent`s reference count
159 
160     fold_tree<tree_node>(parent, ed);
161     allocator.deallocate(this, ed);
162 
163 }
164 
165 //! execute task for parallel_for
166 template<typename Range, typename Body, typename Partitioner>
167 task* start_for<Range, Body, Partitioner>::execute(execution_data& ed) {
168     if (!is_same_affinity(ed)) {
169         my_partition.note_affinity(execution_slot(ed));
170     }
171     my_partition.check_being_stolen(*this, ed);
172     my_partition.execute(*this, my_range, ed);
173     finalize(ed);
174     return nullptr;
175 }
176 
177 //! cancel task for parallel_for
178 template<typename Range, typename Body, typename Partitioner>
179 task* start_for<Range, Body, Partitioner>::cancel(execution_data& ed) {
180     finalize(ed);
181     return nullptr;
182 }
183 
184 //! Calls the function with values from range [begin, end) with a step provided
185 template<typename Function, typename Index>
186 class parallel_for_body_wrapper : detail::no_assign {
187     const Function &my_func;
188     const Index my_begin;
189     const Index my_step;
190 public:
191     parallel_for_body_wrapper( const Function& _func, Index& _begin, Index& _step )
192         : my_func(_func), my_begin(_begin), my_step(_step) {}
193 
194     void operator()( const blocked_range<Index>& r ) const {
195         // A set of local variables to help the compiler with vectorization of the following loop.
196         Index b = r.begin();
197         Index e = r.end();
198         Index ms = my_step;
199         Index k = my_begin + b*ms;
200 
201 #if __INTEL_COMPILER
202 #pragma ivdep
203 #if __TBB_ASSERT_ON_VECTORIZATION_FAILURE
204 #pragma vector always assert
205 #endif
206 #endif
207         for ( Index i = b; i < e; ++i, k += ms ) {
208             my_func( k );
209         }
210     }
211 };
212 
213 // Requirements on Range concept are documented in blocked_range.h
214 
215 /** \page parallel_for_body_req Requirements on parallel_for body
216     Class \c Body implementing the concept of parallel_for body must define:
217     - \code Body::Body( const Body& ); \endcode                 Copy constructor
218     - \code Body::~Body(); \endcode                             Destructor
219     - \code void Body::operator()( Range& r ) const; \endcode   Function call operator applying the body to range \c r.
220 **/
221 
222 /** \name parallel_for
223     See also requirements on \ref range_req "Range" and \ref parallel_for_body_req "parallel_for Body". **/
224 //@{
225 
226 //! Parallel iteration over range with default partitioner.
227 /** @ingroup algorithms **/
228 template<typename Range, typename Body>
229     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
230 void parallel_for( const Range& range, const Body& body ) {
231     start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range,body,__TBB_DEFAULT_PARTITIONER());
232 }
233 
234 //! Parallel iteration over range with simple partitioner.
235 /** @ingroup algorithms **/
236 template<typename Range, typename Body>
237     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
238 void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner ) {
239     start_for<Range,Body,const simple_partitioner>::run(range,body,partitioner);
240 }
241 
242 //! Parallel iteration over range with auto_partitioner.
243 /** @ingroup algorithms **/
244 template<typename Range, typename Body>
245     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
246 void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner ) {
247     start_for<Range,Body,const auto_partitioner>::run(range,body,partitioner);
248 }
249 
250 //! Parallel iteration over range with static_partitioner.
251 /** @ingroup algorithms **/
252 template<typename Range, typename Body>
253     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
254 void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner ) {
255     start_for<Range,Body,const static_partitioner>::run(range,body,partitioner);
256 }
257 
258 //! Parallel iteration over range with affinity_partitioner.
259 /** @ingroup algorithms **/
260 template<typename Range, typename Body>
261     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
262 void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner ) {
263     start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner);
264 }
265 
266 //! Parallel iteration over range with default partitioner and user-supplied context.
267 /** @ingroup algorithms **/
268 template<typename Range, typename Body>
269     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
270 void parallel_for( const Range& range, const Body& body, task_group_context& context ) {
271     start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range, body, __TBB_DEFAULT_PARTITIONER(), context);
272 }
273 
274 //! Parallel iteration over range with simple partitioner and user-supplied context.
275 /** @ingroup algorithms **/
276 template<typename Range, typename Body>
277     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
278 void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
279     start_for<Range,Body,const simple_partitioner>::run(range, body, partitioner, context);
280 }
281 
282 //! Parallel iteration over range with auto_partitioner and user-supplied context.
283 /** @ingroup algorithms **/
284 template<typename Range, typename Body>
285     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
286 void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner, task_group_context& context ) {
287     start_for<Range,Body,const auto_partitioner>::run(range, body, partitioner, context);
288 }
289 
290 //! Parallel iteration over range with static_partitioner and user-supplied context.
291 /** @ingroup algorithms **/
292 template<typename Range, typename Body>
293     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
294 void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner, task_group_context& context ) {
295     start_for<Range,Body,const static_partitioner>::run(range, body, partitioner, context);
296 }
297 
298 //! Parallel iteration over range with affinity_partitioner and user-supplied context.
299 /** @ingroup algorithms **/
300 template<typename Range, typename Body>
301     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
302 void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner, task_group_context& context ) {
303     start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner, context);
304 }
305 
306 //! Implementation of parallel iteration over stepped range of integers with explicit step and partitioner
307 template <typename Index, typename Function, typename Partitioner>
308 void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner) {
309     if (step <= 0 )
310         throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument
311     else if (first < last) {
312         // Above "else" avoids "potential divide by zero" warning on some platforms
313         Index end = (last - first - Index(1)) / step + Index(1);
314         blocked_range<Index> range(static_cast<Index>(0), end);
315         parallel_for_body_wrapper<Function, Index> body(f, first, step);
316         parallel_for(range, body, partitioner);
317     }
318 }
319 
320 //! Parallel iteration over a range of integers with a step provided and default partitioner
321 template <typename Index, typename Function>
322     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
323 void parallel_for(Index first, Index last, Index step, const Function& f) {
324     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner());
325 }
326 //! Parallel iteration over a range of integers with a step provided and simple partitioner
327 template <typename Index, typename Function>
328     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
329 void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner) {
330     parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner);
331 }
332 //! Parallel iteration over a range of integers with a step provided and auto partitioner
333 template <typename Index, typename Function>
334     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
335 void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner) {
336     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner);
337 }
338 //! Parallel iteration over a range of integers with a step provided and static partitioner
339 template <typename Index, typename Function>
340     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
341 void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner) {
342     parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner);
343 }
344 //! Parallel iteration over a range of integers with a step provided and affinity partitioner
345 template <typename Index, typename Function>
346     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
347 void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner) {
348     parallel_for_impl(first, last, step, f, partitioner);
349 }
350 
351 //! Parallel iteration over a range of integers with a default step value and default partitioner
352 template <typename Index, typename Function>
353     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
354 void parallel_for(Index first, Index last, const Function& f) {
355     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner());
356 }
357 //! Parallel iteration over a range of integers with a default step value and simple partitioner
358 template <typename Index, typename Function>
359     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
360 void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner) {
361     parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
362 }
363 //! Parallel iteration over a range of integers with a default step value and auto partitioner
364 template <typename Index, typename Function>
365     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
366 void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner) {
367     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
368 }
369 //! Parallel iteration over a range of integers with a default step value and static partitioner
370 template <typename Index, typename Function>
371     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
372 void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner) {
373     parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
374 }
375 //! Parallel iteration over a range of integers with a default step value and affinity partitioner
376 template <typename Index, typename Function>
377     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
378 void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner) {
379     parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner);
380 }
381 
382 //! Implementation of parallel iteration over stepped range of integers with explicit step, task group context, and partitioner
383 template <typename Index, typename Function, typename Partitioner>
384 void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner, task_group_context &context) {
385     if (step <= 0 )
386         throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument
387     else if (first < last) {
388         // Above "else" avoids "potential divide by zero" warning on some platforms
389         Index end = (last - first - Index(1)) / step + Index(1);
390         blocked_range<Index> range(static_cast<Index>(0), end);
391         parallel_for_body_wrapper<Function, Index> body(f, first, step);
392         parallel_for(range, body, partitioner, context);
393     }
394 }
395 
396 //! Parallel iteration over a range of integers with explicit step, task group context, and default partitioner
397 template <typename Index, typename Function>
398     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
399 void parallel_for(Index first, Index last, Index step, const Function& f, task_group_context &context) {
400     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner(), context);
401 }
402 //! Parallel iteration over a range of integers with explicit step, task group context, and simple partitioner
403 template <typename Index, typename Function>
404     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
405 void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner, task_group_context &context) {
406     parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner, context);
407 }
408 //! Parallel iteration over a range of integers with explicit step, task group context, and auto partitioner
409 template <typename Index, typename Function>
410     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
411 void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner, task_group_context &context) {
412     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner, context);
413 }
414 //! Parallel iteration over a range of integers with explicit step, task group context, and static partitioner
415 template <typename Index, typename Function>
416     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
417 void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner, task_group_context &context) {
418     parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner, context);
419 }
420 //! Parallel iteration over a range of integers with explicit step, task group context, and affinity partitioner
421 template <typename Index, typename Function>
422     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
423 void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner, task_group_context &context) {
424     parallel_for_impl(first, last, step, f, partitioner, context);
425 }
426 
427 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and default partitioner
428 template <typename Index, typename Function>
429     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
430 void parallel_for(Index first, Index last, const Function& f, task_group_context &context) {
431     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner(), context);
432 }
433 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and simple partitioner
434 template <typename Index, typename Function>
435     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
436 void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner, task_group_context &context) {
437     parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
438 }
439 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and auto partitioner
440 template <typename Index, typename Function>
441     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
442 void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner, task_group_context &context) {
443     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
444 }
445 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and static partitioner
446 template <typename Index, typename Function>
447     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
448 void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner, task_group_context &context) {
449     parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
450 }
451 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and affinity_partitioner
452 template <typename Index, typename Function>
453     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
454 void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner, task_group_context &context) {
455     parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner, context);
456 }
457 // @}
458 
459 } // namespace d1
460 } // namespace detail
461 
462 inline namespace v1 {
463 using detail::d1::parallel_for;
464 // Split types
465 using detail::split;
466 using detail::proportional_split;
467 } // namespace v1
468 
469 } // namespace tbb
470 
471 #endif /* __TBB_parallel_for_H */
472