xref: /oneTBB/include/oneapi/tbb/parallel_for.h (revision ea4e6156)
1 /*
2     Copyright (c) 2005-2023 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #ifndef __TBB_parallel_for_H
18 #define __TBB_parallel_for_H
19 
20 #include "detail/_config.h"
21 #include "detail/_namespace_injection.h"
22 #include "detail/_exception.h"
23 #include "detail/_task.h"
24 #include "detail/_small_object_pool.h"
25 #include "profiling.h"
26 
27 #include "partitioner.h"
28 #include "blocked_range.h"
29 #include "task_group.h"
30 
31 #include <cstddef>
32 #include <new>
33 
34 namespace tbb {
35 namespace detail {
36 #if __TBB_CPP20_CONCEPTS_PRESENT
37 inline namespace d0 {
38 
39 template <typename Body, typename Range>
40 concept parallel_for_body = std::copy_constructible<Body> && std::invocable<const std::remove_reference_t<Body>&, Range&>;
41 
42 template <typename Index>
43 concept parallel_for_index = std::constructible_from<Index, int> &&
44                              std::copyable<Index> &&
45                              requires( const std::remove_reference_t<Index>& lhs, const std::remove_reference_t<Index>& rhs ) {
46                                  { lhs < rhs } -> adaptive_same_as<bool>;
47                                  { lhs - rhs } -> std::convertible_to<std::size_t>;
48                                  { lhs + (rhs - lhs) } -> std::convertible_to<Index>;
49                              };
50 
51 template <typename Function, typename Index>
52 concept parallel_for_function = std::invocable<const std::remove_reference_t<Function>&, Index>;
53 
54 } // namespace d0
55 #endif // __TBB_CPP20_CONCEPTS_PRESENT
56 namespace d1 {
57 
58 //! Task type used in parallel_for
59 /** @ingroup algorithms */
60 template<typename Range, typename Body, typename Partitioner>
61 struct start_for : public task {
62     Range my_range;
63     const Body my_body;
64     node* my_parent;
65 
66     typename Partitioner::task_partition_type my_partition;
67     small_object_allocator my_allocator;
68 
69     task* execute(execution_data&) override;
70     task* cancel(execution_data&) override;
71     void finalize(const execution_data&);
72 
73     //! Constructor for root task.
74     start_for( const Range& range, const Body& body, Partitioner& partitioner, small_object_allocator& alloc ) :
75         my_range(range),
76         my_body(body),
77         my_parent(nullptr),
78         my_partition(partitioner),
79         my_allocator(alloc) {}
80     //! Splitting constructor used to generate children.
81     /** parent_ becomes left child.  Newly constructed object is right child. */
82     start_for( start_for& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) :
83         my_range(parent_.my_range, get_range_split_object<Range>(split_obj)),
84         my_body(parent_.my_body),
85         my_parent(nullptr),
86         my_partition(parent_.my_partition, split_obj),
87         my_allocator(alloc) {}
88     //! Construct right child from the given range as response to the demand.
89     /** parent_ remains left child.  Newly constructed object is right child. */
90     start_for( start_for& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) :
91         my_range(r),
92         my_body(parent_.my_body),
93         my_parent(nullptr),
94         my_partition(parent_.my_partition, split()),
95         my_allocator(alloc)
96     {
97         my_partition.align_depth( d );
98     }
99     static void run(const Range& range, const Body& body, Partitioner& partitioner) {
100         task_group_context context(PARALLEL_FOR);
101         run(range, body, partitioner, context);
102     }
103 
104     static void run(const Range& range, const Body& body, Partitioner& partitioner, task_group_context& context) {
105         if ( !range.empty() ) {
106             small_object_allocator alloc{};
107             start_for& for_task = *alloc.new_object<start_for>(range, body, partitioner, alloc);
108 
109             // defer creation of the wait node until task allocation succeeds
110             wait_node wn;
111             for_task.my_parent = &wn;
112             execute_and_wait(for_task, context, wn.m_wait, context);
113         }
114     }
115     //! Run body for range, serves as callback for partitioner
116     void run_body( Range &r ) {
117         tbb::detail::invoke(my_body, r);
118     }
119 
120     //! spawn right task, serves as callback for partitioner
121     void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) {
122        offer_work_impl(ed, *this, split_obj);
123     }
124 
125     //! spawn right task, serves as callback for partitioner
126     void offer_work(const Range& r, depth_t d, execution_data& ed) {
127         offer_work_impl(ed, *this, r, d);
128     }
129 
130 private:
131     template <typename... Args>
132     void offer_work_impl(execution_data& ed, Args&&... constructor_args) {
133         // New right child
134         small_object_allocator alloc{};
135         start_for& right_child = *alloc.new_object<start_for>(ed, std::forward<Args>(constructor_args)..., alloc);
136 
137         // New root node as a continuation and ref count. Left and right child attach to the new parent.
138         right_child.my_parent = my_parent = alloc.new_object<tree_node>(ed, my_parent, 2, alloc);
139         // Spawn the right sibling
140         right_child.spawn_self(ed);
141     }
142 
143     void spawn_self(execution_data& ed) {
144         my_partition.spawn_task(*this, *context(ed));
145     }
146 };
147 
148 //! fold the tree and deallocate the task
149 template<typename Range, typename Body, typename Partitioner>
150 void start_for<Range, Body, Partitioner>::finalize(const execution_data& ed) {
151     // Get the current parent and allocator an object destruction
152     node* parent = my_parent;
153     auto allocator = my_allocator;
154     // Task execution finished - destroy it
155     this->~start_for();
156     // Unwind the tree decrementing the parent`s reference count
157 
158     fold_tree<tree_node>(parent, ed);
159     allocator.deallocate(this, ed);
160 
161 }
162 
163 //! execute task for parallel_for
164 template<typename Range, typename Body, typename Partitioner>
165 task* start_for<Range, Body, Partitioner>::execute(execution_data& ed) {
166     if (!is_same_affinity(ed)) {
167         my_partition.note_affinity(execution_slot(ed));
168     }
169     my_partition.check_being_stolen(*this, ed);
170     my_partition.execute(*this, my_range, ed);
171     finalize(ed);
172     return nullptr;
173 }
174 
175 //! cancel task for parallel_for
176 template<typename Range, typename Body, typename Partitioner>
177 task* start_for<Range, Body, Partitioner>::cancel(execution_data& ed) {
178     finalize(ed);
179     return nullptr;
180 }
181 
182 //! Calls the function with values from range [begin, end) with a step provided
183 template<typename Function, typename Index>
184 class parallel_for_body_wrapper : detail::no_assign {
185     const Function &my_func;
186     const Index my_begin;
187     const Index my_step;
188 public:
189     parallel_for_body_wrapper( const Function& _func, Index& _begin, Index& _step )
190         : my_func(_func), my_begin(_begin), my_step(_step) {}
191 
192     void operator()( const blocked_range<Index>& r ) const {
193         // A set of local variables to help the compiler with vectorization of the following loop.
194         Index b = r.begin();
195         Index e = r.end();
196         Index ms = my_step;
197         Index k = my_begin + b*ms;
198 
199 #if __INTEL_COMPILER
200 #pragma ivdep
201 #if __TBB_ASSERT_ON_VECTORIZATION_FAILURE
202 #pragma vector always assert
203 #endif
204 #endif
205         for ( Index i = b; i < e; ++i, k += ms ) {
206             tbb::detail::invoke(my_func, k);
207         }
208     }
209 };
210 
211 // Requirements on Range concept are documented in blocked_range.h
212 
213 /** \page parallel_for_body_req Requirements on parallel_for body
214     Class \c Body implementing the concept of parallel_for body must define:
215     - \code Body::Body( const Body& ); \endcode                 Copy constructor
216     - \code Body::~Body(); \endcode                             Destructor
217     - \code void Body::operator()( Range& r ) const; \endcode   Function call operator applying the body to range \c r.
218 **/
219 
220 /** \name parallel_for
221     See also requirements on \ref range_req "Range" and \ref parallel_for_body_req "parallel_for Body". **/
222 //@{
223 
224 //! Parallel iteration over range with default partitioner.
225 /** @ingroup algorithms **/
226 template<typename Range, typename Body>
227     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
228 void parallel_for( const Range& range, const Body& body ) {
229     start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range,body,__TBB_DEFAULT_PARTITIONER());
230 }
231 
232 //! Parallel iteration over range with simple partitioner.
233 /** @ingroup algorithms **/
234 template<typename Range, typename Body>
235     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
236 void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner ) {
237     start_for<Range,Body,const simple_partitioner>::run(range,body,partitioner);
238 }
239 
240 //! Parallel iteration over range with auto_partitioner.
241 /** @ingroup algorithms **/
242 template<typename Range, typename Body>
243     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
244 void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner ) {
245     start_for<Range,Body,const auto_partitioner>::run(range,body,partitioner);
246 }
247 
248 //! Parallel iteration over range with static_partitioner.
249 /** @ingroup algorithms **/
250 template<typename Range, typename Body>
251     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
252 void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner ) {
253     start_for<Range,Body,const static_partitioner>::run(range,body,partitioner);
254 }
255 
256 //! Parallel iteration over range with affinity_partitioner.
257 /** @ingroup algorithms **/
258 template<typename Range, typename Body>
259     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
260 void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner ) {
261     start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner);
262 }
263 
264 //! Parallel iteration over range with default partitioner and user-supplied context.
265 /** @ingroup algorithms **/
266 template<typename Range, typename Body>
267     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
268 void parallel_for( const Range& range, const Body& body, task_group_context& context ) {
269     start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range, body, __TBB_DEFAULT_PARTITIONER(), context);
270 }
271 
272 //! Parallel iteration over range with simple partitioner and user-supplied context.
273 /** @ingroup algorithms **/
274 template<typename Range, typename Body>
275     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
276 void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
277     start_for<Range,Body,const simple_partitioner>::run(range, body, partitioner, context);
278 }
279 
280 //! Parallel iteration over range with auto_partitioner and user-supplied context.
281 /** @ingroup algorithms **/
282 template<typename Range, typename Body>
283     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
284 void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner, task_group_context& context ) {
285     start_for<Range,Body,const auto_partitioner>::run(range, body, partitioner, context);
286 }
287 
288 //! Parallel iteration over range with static_partitioner and user-supplied context.
289 /** @ingroup algorithms **/
290 template<typename Range, typename Body>
291     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
292 void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner, task_group_context& context ) {
293     start_for<Range,Body,const static_partitioner>::run(range, body, partitioner, context);
294 }
295 
296 //! Parallel iteration over range with affinity_partitioner and user-supplied context.
297 /** @ingroup algorithms **/
298 template<typename Range, typename Body>
299     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
300 void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner, task_group_context& context ) {
301     start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner, context);
302 }
303 
304 //! Implementation of parallel iteration over stepped range of integers with explicit step and partitioner
305 template <typename Index, typename Function, typename Partitioner>
306 void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner) {
307     if (step <= 0 )
308         throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument
309     else if (first < last) {
310         // Above "else" avoids "potential divide by zero" warning on some platforms
311         Index end = Index(last - first - 1ul) / step + Index(1);
312         blocked_range<Index> range(static_cast<Index>(0), end);
313         parallel_for_body_wrapper<Function, Index> body(f, first, step);
314         parallel_for(range, body, partitioner);
315     }
316 }
317 
318 //! Parallel iteration over a range of integers with a step provided and default partitioner
319 template <typename Index, typename Function>
320     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
321 void parallel_for(Index first, Index last, Index step, const Function& f) {
322     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner());
323 }
324 //! Parallel iteration over a range of integers with a step provided and simple partitioner
325 template <typename Index, typename Function>
326     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
327 void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner) {
328     parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner);
329 }
330 //! Parallel iteration over a range of integers with a step provided and auto partitioner
331 template <typename Index, typename Function>
332     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
333 void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner) {
334     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner);
335 }
336 //! Parallel iteration over a range of integers with a step provided and static partitioner
337 template <typename Index, typename Function>
338     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
339 void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner) {
340     parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner);
341 }
342 //! Parallel iteration over a range of integers with a step provided and affinity partitioner
343 template <typename Index, typename Function>
344     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
345 void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner) {
346     parallel_for_impl(first, last, step, f, partitioner);
347 }
348 
349 //! Parallel iteration over a range of integers with a default step value and default partitioner
350 template <typename Index, typename Function>
351     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
352 void parallel_for(Index first, Index last, const Function& f) {
353     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner());
354 }
355 //! Parallel iteration over a range of integers with a default step value and simple partitioner
356 template <typename Index, typename Function>
357     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
358 void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner) {
359     parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
360 }
361 //! Parallel iteration over a range of integers with a default step value and auto partitioner
362 template <typename Index, typename Function>
363     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
364 void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner) {
365     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
366 }
367 //! Parallel iteration over a range of integers with a default step value and static partitioner
368 template <typename Index, typename Function>
369     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
370 void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner) {
371     parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
372 }
373 //! Parallel iteration over a range of integers with a default step value and affinity partitioner
374 template <typename Index, typename Function>
375     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
376 void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner) {
377     parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner);
378 }
379 
380 //! Implementation of parallel iteration over stepped range of integers with explicit step, task group context, and partitioner
381 template <typename Index, typename Function, typename Partitioner>
382 void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner, task_group_context &context) {
383     if (step <= 0 )
384         throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument
385     else if (first < last) {
386         // Above "else" avoids "potential divide by zero" warning on some platforms
387         Index end = (last - first - Index(1)) / step + Index(1);
388         blocked_range<Index> range(static_cast<Index>(0), end);
389         parallel_for_body_wrapper<Function, Index> body(f, first, step);
390         parallel_for(range, body, partitioner, context);
391     }
392 }
393 
394 //! Parallel iteration over a range of integers with explicit step, task group context, and default partitioner
395 template <typename Index, typename Function>
396     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
397 void parallel_for(Index first, Index last, Index step, const Function& f, task_group_context &context) {
398     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner(), context);
399 }
400 //! Parallel iteration over a range of integers with explicit step, task group context, and simple partitioner
401 template <typename Index, typename Function>
402     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
403 void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner, task_group_context &context) {
404     parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner, context);
405 }
406 //! Parallel iteration over a range of integers with explicit step, task group context, and auto partitioner
407 template <typename Index, typename Function>
408     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
409 void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner, task_group_context &context) {
410     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner, context);
411 }
412 //! Parallel iteration over a range of integers with explicit step, task group context, and static partitioner
413 template <typename Index, typename Function>
414     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
415 void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner, task_group_context &context) {
416     parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner, context);
417 }
418 //! Parallel iteration over a range of integers with explicit step, task group context, and affinity partitioner
419 template <typename Index, typename Function>
420     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
421 void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner, task_group_context &context) {
422     parallel_for_impl(first, last, step, f, partitioner, context);
423 }
424 
425 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and default partitioner
426 template <typename Index, typename Function>
427     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
428 void parallel_for(Index first, Index last, const Function& f, task_group_context &context) {
429     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner(), context);
430 }
431 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and simple partitioner
432 template <typename Index, typename Function>
433     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
434 void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner, task_group_context &context) {
435     parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
436 }
437 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and auto partitioner
438 template <typename Index, typename Function>
439     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
440 void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner, task_group_context &context) {
441     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
442 }
443 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and static partitioner
444 template <typename Index, typename Function>
445     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
446 void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner, task_group_context &context) {
447     parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
448 }
449 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and affinity_partitioner
450 template <typename Index, typename Function>
451     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
452 void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner, task_group_context &context) {
453     parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner, context);
454 }
455 // @}
456 
457 } // namespace d1
458 } // namespace detail
459 
460 inline namespace v1 {
461 using detail::d1::parallel_for;
462 // Split types
463 using detail::split;
464 using detail::proportional_split;
465 } // namespace v1
466 
467 } // namespace tbb
468 
469 #endif /* __TBB_parallel_for_H */
470