xref: /oneTBB/include/oneapi/tbb/parallel_for.h (revision bb45f092)
1 /*
2     Copyright (c) 2005-2022 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #ifndef __TBB_parallel_for_H
18 #define __TBB_parallel_for_H
19 
20 #include "detail/_config.h"
21 #include "detail/_namespace_injection.h"
22 #include "detail/_exception.h"
23 #include "detail/_task.h"
24 #include "detail/_small_object_pool.h"
25 #include "profiling.h"
26 
27 #include "partitioner.h"
28 #include "blocked_range.h"
29 #include "task_group.h"
30 
31 #include <cstddef>
32 #include <new>
33 
34 namespace tbb {
35 namespace detail {
36 #if __TBB_CPP20_CONCEPTS_PRESENT
37 inline namespace d0 {
38 
39 template <typename Body, typename Range>
40 concept parallel_for_body = std::copy_constructible<Body> &&
41                             requires( const std::remove_reference_t<Body>& body, Range& range ) {
42                                 body(range);
43                             };
44 
45 template <typename Index>
46 concept parallel_for_index = std::constructible_from<Index, int> &&
47                              std::copyable<Index> &&
48                              requires( const std::remove_reference_t<Index>& lhs, const std::remove_reference_t<Index>& rhs ) {
49                                  { lhs < rhs } -> adaptive_same_as<bool>;
50                                  { lhs - rhs } -> std::convertible_to<std::size_t>;
51                                  { lhs + (rhs - lhs) } -> std::convertible_to<Index>;
52                              };
53 
54 template <typename Function, typename Index>
55 concept parallel_for_function = requires( const std::remove_reference_t<Function>& func, Index index ) {
56     func(index);
57 };
58 
59 } // namespace d0
60 #endif // __TBB_CPP20_CONCEPTS_PRESENT
61 namespace d1 {
62 
63 //! Task type used in parallel_for
64 /** @ingroup algorithms */
65 template<typename Range, typename Body, typename Partitioner>
66 struct start_for : public task {
67     Range my_range;
68     const Body my_body;
69     node* my_parent;
70 
71     typename Partitioner::task_partition_type my_partition;
72     small_object_allocator my_allocator;
73 
74     task* execute(execution_data&) override;
75     task* cancel(execution_data&) override;
76     void finalize(const execution_data&);
77 
78     //! Constructor for root task.
79     start_for( const Range& range, const Body& body, Partitioner& partitioner, small_object_allocator& alloc ) :
80         my_range(range),
81         my_body(body),
82         my_parent(nullptr),
83         my_partition(partitioner),
84         my_allocator(alloc) {}
85     //! Splitting constructor used to generate children.
86     /** parent_ becomes left child.  Newly constructed object is right child. */
87     start_for( start_for& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) :
88         my_range(parent_.my_range, get_range_split_object<Range>(split_obj)),
89         my_body(parent_.my_body),
90         my_parent(nullptr),
91         my_partition(parent_.my_partition, split_obj),
92         my_allocator(alloc) {}
93     //! Construct right child from the given range as response to the demand.
94     /** parent_ remains left child.  Newly constructed object is right child. */
95     start_for( start_for& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) :
96         my_range(r),
97         my_body(parent_.my_body),
98         my_parent(nullptr),
99         my_partition(parent_.my_partition, split()),
100         my_allocator(alloc)
101     {
102         my_partition.align_depth( d );
103     }
104     static void run(const Range& range, const Body& body, Partitioner& partitioner) {
105         task_group_context context(PARALLEL_FOR);
106         run(range, body, partitioner, context);
107     }
108 
109     static void run(const Range& range, const Body& body, Partitioner& partitioner, task_group_context& context) {
110         if ( !range.empty() ) {
111             small_object_allocator alloc{};
112             start_for& for_task = *alloc.new_object<start_for>(range, body, partitioner, alloc);
113 
114             // defer creation of the wait node until task allocation succeeds
115             wait_node wn;
116             for_task.my_parent = &wn;
117             execute_and_wait(for_task, context, wn.m_wait, context);
118         }
119     }
120     //! Run body for range, serves as callback for partitioner
121     void run_body( Range &r ) {
122         my_body( r );
123     }
124 
125     //! spawn right task, serves as callback for partitioner
126     void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) {
127        offer_work_impl(ed, *this, split_obj);
128     }
129 
130     //! spawn right task, serves as callback for partitioner
131     void offer_work(const Range& r, depth_t d, execution_data& ed) {
132         offer_work_impl(ed, *this, r, d);
133     }
134 
135 private:
136     template <typename... Args>
137     void offer_work_impl(execution_data& ed, Args&&... constructor_args) {
138         // New right child
139         small_object_allocator alloc{};
140         start_for& right_child = *alloc.new_object<start_for>(ed, std::forward<Args>(constructor_args)..., alloc);
141 
142         // New root node as a continuation and ref count. Left and right child attach to the new parent.
143         right_child.my_parent = my_parent = alloc.new_object<tree_node>(ed, my_parent, 2, alloc);
144         // Spawn the right sibling
145         right_child.spawn_self(ed);
146     }
147 
148     void spawn_self(execution_data& ed) {
149         my_partition.spawn_task(*this, *context(ed));
150     }
151 };
152 
153 //! fold the tree and deallocate the task
154 template<typename Range, typename Body, typename Partitioner>
155 void start_for<Range, Body, Partitioner>::finalize(const execution_data& ed) {
156     // Get the current parent and allocator an object destruction
157     node* parent = my_parent;
158     auto allocator = my_allocator;
159     // Task execution finished - destroy it
160     this->~start_for();
161     // Unwind the tree decrementing the parent`s reference count
162 
163     fold_tree<tree_node>(parent, ed);
164     allocator.deallocate(this, ed);
165 
166 }
167 
168 //! execute task for parallel_for
169 template<typename Range, typename Body, typename Partitioner>
170 task* start_for<Range, Body, Partitioner>::execute(execution_data& ed) {
171     if (!is_same_affinity(ed)) {
172         my_partition.note_affinity(execution_slot(ed));
173     }
174     my_partition.check_being_stolen(*this, ed);
175     my_partition.execute(*this, my_range, ed);
176     finalize(ed);
177     return nullptr;
178 }
179 
180 //! cancel task for parallel_for
181 template<typename Range, typename Body, typename Partitioner>
182 task* start_for<Range, Body, Partitioner>::cancel(execution_data& ed) {
183     finalize(ed);
184     return nullptr;
185 }
186 
187 //! Calls the function with values from range [begin, end) with a step provided
188 template<typename Function, typename Index>
189 class parallel_for_body_wrapper : detail::no_assign {
190     const Function &my_func;
191     const Index my_begin;
192     const Index my_step;
193 public:
194     parallel_for_body_wrapper( const Function& _func, Index& _begin, Index& _step )
195         : my_func(_func), my_begin(_begin), my_step(_step) {}
196 
197     void operator()( const blocked_range<Index>& r ) const {
198         // A set of local variables to help the compiler with vectorization of the following loop.
199         Index b = r.begin();
200         Index e = r.end();
201         Index ms = my_step;
202         Index k = my_begin + b*ms;
203 
204 #if __INTEL_COMPILER
205 #pragma ivdep
206 #if __TBB_ASSERT_ON_VECTORIZATION_FAILURE
207 #pragma vector always assert
208 #endif
209 #endif
210         for ( Index i = b; i < e; ++i, k += ms ) {
211             my_func( k );
212         }
213     }
214 };
215 
216 // Requirements on Range concept are documented in blocked_range.h
217 
218 /** \page parallel_for_body_req Requirements on parallel_for body
219     Class \c Body implementing the concept of parallel_for body must define:
220     - \code Body::Body( const Body& ); \endcode                 Copy constructor
221     - \code Body::~Body(); \endcode                             Destructor
222     - \code void Body::operator()( Range& r ) const; \endcode   Function call operator applying the body to range \c r.
223 **/
224 
225 /** \name parallel_for
226     See also requirements on \ref range_req "Range" and \ref parallel_for_body_req "parallel_for Body". **/
227 //@{
228 
229 //! Parallel iteration over range with default partitioner.
230 /** @ingroup algorithms **/
231 template<typename Range, typename Body>
232     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
233 void parallel_for( const Range& range, const Body& body ) {
234     start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range,body,__TBB_DEFAULT_PARTITIONER());
235 }
236 
237 //! Parallel iteration over range with simple partitioner.
238 /** @ingroup algorithms **/
239 template<typename Range, typename Body>
240     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
241 void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner ) {
242     start_for<Range,Body,const simple_partitioner>::run(range,body,partitioner);
243 }
244 
245 //! Parallel iteration over range with auto_partitioner.
246 /** @ingroup algorithms **/
247 template<typename Range, typename Body>
248     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
249 void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner ) {
250     start_for<Range,Body,const auto_partitioner>::run(range,body,partitioner);
251 }
252 
253 //! Parallel iteration over range with static_partitioner.
254 /** @ingroup algorithms **/
255 template<typename Range, typename Body>
256     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
257 void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner ) {
258     start_for<Range,Body,const static_partitioner>::run(range,body,partitioner);
259 }
260 
261 //! Parallel iteration over range with affinity_partitioner.
262 /** @ingroup algorithms **/
263 template<typename Range, typename Body>
264     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
265 void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner ) {
266     start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner);
267 }
268 
269 //! Parallel iteration over range with default partitioner and user-supplied context.
270 /** @ingroup algorithms **/
271 template<typename Range, typename Body>
272     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
273 void parallel_for( const Range& range, const Body& body, task_group_context& context ) {
274     start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range, body, __TBB_DEFAULT_PARTITIONER(), context);
275 }
276 
277 //! Parallel iteration over range with simple partitioner and user-supplied context.
278 /** @ingroup algorithms **/
279 template<typename Range, typename Body>
280     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
281 void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
282     start_for<Range,Body,const simple_partitioner>::run(range, body, partitioner, context);
283 }
284 
285 //! Parallel iteration over range with auto_partitioner and user-supplied context.
286 /** @ingroup algorithms **/
287 template<typename Range, typename Body>
288     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
289 void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner, task_group_context& context ) {
290     start_for<Range,Body,const auto_partitioner>::run(range, body, partitioner, context);
291 }
292 
293 //! Parallel iteration over range with static_partitioner and user-supplied context.
294 /** @ingroup algorithms **/
295 template<typename Range, typename Body>
296     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
297 void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner, task_group_context& context ) {
298     start_for<Range,Body,const static_partitioner>::run(range, body, partitioner, context);
299 }
300 
301 //! Parallel iteration over range with affinity_partitioner and user-supplied context.
302 /** @ingroup algorithms **/
303 template<typename Range, typename Body>
304     __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
305 void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner, task_group_context& context ) {
306     start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner, context);
307 }
308 
309 //! Implementation of parallel iteration over stepped range of integers with explicit step and partitioner
310 template <typename Index, typename Function, typename Partitioner>
311 void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner) {
312     if (step <= 0 )
313         throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument
314     else if (first < last) {
315         // Above "else" avoids "potential divide by zero" warning on some platforms
316         Index end = (last - first - Index(1)) / step + Index(1);
317         blocked_range<Index> range(static_cast<Index>(0), end);
318         parallel_for_body_wrapper<Function, Index> body(f, first, step);
319         parallel_for(range, body, partitioner);
320     }
321 }
322 
323 //! Parallel iteration over a range of integers with a step provided and default partitioner
324 template <typename Index, typename Function>
325     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
326 void parallel_for(Index first, Index last, Index step, const Function& f) {
327     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner());
328 }
329 //! Parallel iteration over a range of integers with a step provided and simple partitioner
330 template <typename Index, typename Function>
331     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
332 void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner) {
333     parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner);
334 }
335 //! Parallel iteration over a range of integers with a step provided and auto partitioner
336 template <typename Index, typename Function>
337     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
338 void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner) {
339     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner);
340 }
341 //! Parallel iteration over a range of integers with a step provided and static partitioner
342 template <typename Index, typename Function>
343     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
344 void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner) {
345     parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner);
346 }
347 //! Parallel iteration over a range of integers with a step provided and affinity partitioner
348 template <typename Index, typename Function>
349     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
350 void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner) {
351     parallel_for_impl(first, last, step, f, partitioner);
352 }
353 
354 //! Parallel iteration over a range of integers with a default step value and default partitioner
355 template <typename Index, typename Function>
356     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
357 void parallel_for(Index first, Index last, const Function& f) {
358     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner());
359 }
360 //! Parallel iteration over a range of integers with a default step value and simple partitioner
361 template <typename Index, typename Function>
362     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
363 void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner) {
364     parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
365 }
366 //! Parallel iteration over a range of integers with a default step value and auto partitioner
367 template <typename Index, typename Function>
368     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
369 void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner) {
370     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
371 }
372 //! Parallel iteration over a range of integers with a default step value and static partitioner
373 template <typename Index, typename Function>
374     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
375 void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner) {
376     parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
377 }
378 //! Parallel iteration over a range of integers with a default step value and affinity partitioner
379 template <typename Index, typename Function>
380     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
381 void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner) {
382     parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner);
383 }
384 
385 //! Implementation of parallel iteration over stepped range of integers with explicit step, task group context, and partitioner
386 template <typename Index, typename Function, typename Partitioner>
387 void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner, task_group_context &context) {
388     if (step <= 0 )
389         throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument
390     else if (first < last) {
391         // Above "else" avoids "potential divide by zero" warning on some platforms
392         Index end = (last - first - Index(1)) / step + Index(1);
393         blocked_range<Index> range(static_cast<Index>(0), end);
394         parallel_for_body_wrapper<Function, Index> body(f, first, step);
395         parallel_for(range, body, partitioner, context);
396     }
397 }
398 
399 //! Parallel iteration over a range of integers with explicit step, task group context, and default partitioner
400 template <typename Index, typename Function>
401     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
402 void parallel_for(Index first, Index last, Index step, const Function& f, task_group_context &context) {
403     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner(), context);
404 }
405 //! Parallel iteration over a range of integers with explicit step, task group context, and simple partitioner
406 template <typename Index, typename Function>
407     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
408 void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner, task_group_context &context) {
409     parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner, context);
410 }
411 //! Parallel iteration over a range of integers with explicit step, task group context, and auto partitioner
412 template <typename Index, typename Function>
413     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
414 void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner, task_group_context &context) {
415     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner, context);
416 }
417 //! Parallel iteration over a range of integers with explicit step, task group context, and static partitioner
418 template <typename Index, typename Function>
419     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
420 void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner, task_group_context &context) {
421     parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner, context);
422 }
423 //! Parallel iteration over a range of integers with explicit step, task group context, and affinity partitioner
424 template <typename Index, typename Function>
425     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
426 void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner, task_group_context &context) {
427     parallel_for_impl(first, last, step, f, partitioner, context);
428 }
429 
430 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and default partitioner
431 template <typename Index, typename Function>
432     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
433 void parallel_for(Index first, Index last, const Function& f, task_group_context &context) {
434     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner(), context);
435 }
436 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and simple partitioner
437 template <typename Index, typename Function>
438     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
439 void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner, task_group_context &context) {
440     parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
441 }
442 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and auto partitioner
443 template <typename Index, typename Function>
444     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
445 void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner, task_group_context &context) {
446     parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
447 }
448 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and static partitioner
449 template <typename Index, typename Function>
450     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
451 void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner, task_group_context &context) {
452     parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
453 }
454 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and affinity_partitioner
455 template <typename Index, typename Function>
456     __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
457 void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner, task_group_context &context) {
458     parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner, context);
459 }
460 // @}
461 
462 } // namespace d1
463 } // namespace detail
464 
465 inline namespace v1 {
466 using detail::d1::parallel_for;
467 // Split types
468 using detail::split;
469 using detail::proportional_split;
470 } // namespace v1
471 
472 } // namespace tbb
473 
474 #endif /* __TBB_parallel_for_H */
475