xref: /oneTBB/include/oneapi/tbb/parallel_reduce.h (revision 3c162e4e)
1 /*
2     Copyright (c) 2005-2023 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #ifndef __TBB_parallel_reduce_H
18 #define __TBB_parallel_reduce_H
19 
20 #include <new>
21 #include "detail/_namespace_injection.h"
22 #include "detail/_task.h"
23 #include "detail/_aligned_space.h"
24 #include "detail/_small_object_pool.h"
25 #include "detail/_range_common.h"
26 
27 #include "task_group.h" // task_group_context
28 #include "partitioner.h"
29 #include "profiling.h"
30 
31 namespace tbb {
32 namespace detail {
33 #if __TBB_CPP20_CONCEPTS_PRESENT
34 inline namespace d0 {
35 
36 template <typename Body, typename Range>
37 concept parallel_reduce_body = splittable<Body> &&
38                                requires( Body& body, const Range& range, Body& rhs ) {
39                                    body(range);
40                                    body.join(rhs);
41                                };
42 
43 template <typename Function, typename Range, typename Value>
44 concept parallel_reduce_function = std::invocable<const std::remove_reference_t<Function>&,
45                                                   const Range&, const Value&> &&
46                                    std::convertible_to<std::invoke_result_t<const std::remove_reference_t<Function>&,
47                                                                             const Range&, const Value&>,
48                                                         Value>;
49 
50 template <typename Combine, typename Value>
51 concept parallel_reduce_combine = std::invocable<const std::remove_reference_t<Combine>&,
52                                                  const Value&, const Value&> &&
53                                   std::convertible_to<std::invoke_result_t<const std::remove_reference_t<Combine>&,
54                                                                            const Value&, const Value&>,
55                                                       Value>;
56 
57 } // namespace d0
58 #endif // __TBB_CPP20_CONCEPTS_PRESENT
59 namespace d1 {
60 
61 //! Tree node type for parallel_reduce.
62 /** @ingroup algorithms */
63 //TODO: consider folding tree via bypass execution(instead of manual folding)
64 // for better cancellation and critical tasks handling (performance measurements required).
65 template<typename Body>
66 struct reduction_tree_node : public tree_node {
67     tbb::detail::aligned_space<Body> zombie_space;
68     Body& left_body;
69     bool has_right_zombie{false};
70 
71     reduction_tree_node(node* parent, int ref_count, Body& input_left_body, small_object_allocator& alloc) :
72         tree_node{parent, ref_count, alloc},
73         left_body(input_left_body) /* gcc4.8 bug - braced-initialization doesn't work for class members of reference type */
74     {}
75 
76     void join(task_group_context* context) {
77         if (has_right_zombie && !context->is_group_execution_cancelled())
78             left_body.join(*zombie_space.begin());
79     }
80 
81     ~reduction_tree_node() {
82         if( has_right_zombie ) zombie_space.begin()->~Body();
83     }
84 };
85 
86 //! Task type used to split the work of parallel_reduce.
87 /** @ingroup algorithms */
88 template<typename Range, typename Body, typename Partitioner>
89 struct start_reduce : public task {
90     Range my_range;
91     Body* my_body;
92     node* my_parent;
93 
94     typename Partitioner::task_partition_type my_partition;
95     small_object_allocator my_allocator;
96     bool is_right_child;
97 
98     task* execute(execution_data&) override;
99     task* cancel(execution_data&) override;
100     void finalize(const execution_data&);
101 
102     using tree_node_type = reduction_tree_node<Body>;
103 
104     //! Constructor reduce root task.
105     start_reduce( const Range& range, Body& body, Partitioner& partitioner, small_object_allocator& alloc ) :
106         my_range(range),
107         my_body(&body),
108         my_parent(nullptr),
109         my_partition(partitioner),
110         my_allocator(alloc),
111         is_right_child(false) {}
112     //! Splitting constructor used to generate children.
113     /** parent_ becomes left child. Newly constructed object is right child. */
114     start_reduce( start_reduce& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) :
115         my_range(parent_.my_range, get_range_split_object<Range>(split_obj)),
116         my_body(parent_.my_body),
117         my_parent(nullptr),
118         my_partition(parent_.my_partition, split_obj),
119         my_allocator(alloc),
120         is_right_child(true)
121     {
122         parent_.is_right_child = false;
123     }
124     //! Construct right child from the given range as response to the demand.
125     /** parent_ remains left child. Newly constructed object is right child. */
126     start_reduce( start_reduce& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) :
127         my_range(r),
128         my_body(parent_.my_body),
129         my_parent(nullptr),
130         my_partition(parent_.my_partition, split()),
131         my_allocator(alloc),
132         is_right_child(true)
133     {
134         my_partition.align_depth( d );
135         parent_.is_right_child = false;
136     }
137     static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) {
138         if ( !range.empty() ) {
139             wait_node wn;
140             small_object_allocator alloc{};
141             auto reduce_task = alloc.new_object<start_reduce>(range, body, partitioner, alloc);
142             reduce_task->my_parent = &wn;
143             execute_and_wait(*reduce_task, context, wn.m_wait, context);
144         }
145     }
146     static void run(const Range& range, Body& body, Partitioner& partitioner) {
147         // Bound context prevents exceptions from body to affect nesting or sibling algorithms,
148         // and allows users to handle exceptions safely by wrapping parallel_reduce in the try-block.
149         task_group_context context(PARALLEL_REDUCE);
150         run(range, body, partitioner, context);
151     }
152     //! Run body for range, serves as callback for partitioner
153     void run_body( Range &r ) {
154         tbb::detail::invoke(*my_body, r);
155     }
156 
157     //! spawn right task, serves as callback for partitioner
158     void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) {
159         offer_work_impl(ed, *this, split_obj);
160     }
161     //! spawn right task, serves as callback for partitioner
162     void offer_work(const Range& r, depth_t d, execution_data& ed) {
163         offer_work_impl(ed, *this, r, d);
164     }
165 
166 private:
167     template <typename... Args>
168     void offer_work_impl(execution_data& ed, Args&&... args) {
169         small_object_allocator alloc{};
170         // New right child
171         auto right_child = alloc.new_object<start_reduce>(ed, std::forward<Args>(args)..., alloc);
172 
173         // New root node as a continuation and ref count. Left and right child attach to the new parent.
174         right_child->my_parent = my_parent = alloc.new_object<tree_node_type>(ed, my_parent, 2, *my_body, alloc);
175 
176         // Spawn the right sibling
177         right_child->spawn_self(ed);
178     }
179 
180     void spawn_self(execution_data& ed) {
181         my_partition.spawn_task(*this, *context(ed));
182     }
183 };
184 
185 //! fold the tree and deallocate the task
186 template<typename Range, typename Body, typename Partitioner>
187 void start_reduce<Range, Body, Partitioner>::finalize(const execution_data& ed) {
188     // Get the current parent and wait object before an object destruction
189     node* parent = my_parent;
190     auto allocator = my_allocator;
191     // Task execution finished - destroy it
192     this->~start_reduce();
193     // Unwind the tree decrementing the parent`s reference count
194     fold_tree<tree_node_type>(parent, ed);
195     allocator.deallocate(this, ed);
196 }
197 
198 //! Execute parallel_reduce task
199 template<typename Range, typename Body, typename Partitioner>
200 task* start_reduce<Range,Body,Partitioner>::execute(execution_data& ed) {
201     if (!is_same_affinity(ed)) {
202         my_partition.note_affinity(execution_slot(ed));
203     }
204     my_partition.check_being_stolen(*this, ed);
205 
206     // The acquire barrier synchronizes the data pointed with my_body if the left
207     // task has already finished.
208     __TBB_ASSERT(my_parent, nullptr);
209     if( is_right_child && my_parent->m_ref_count.load(std::memory_order_acquire) == 2 ) {
210         tree_node_type* parent_ptr = static_cast<tree_node_type*>(my_parent);
211         my_body = static_cast<Body*>(new( parent_ptr->zombie_space.begin() ) Body(*my_body, split()));
212         parent_ptr->has_right_zombie = true;
213     }
214     __TBB_ASSERT(my_body != nullptr, "Incorrect body value");
215 
216     my_partition.execute(*this, my_range, ed);
217 
218     finalize(ed);
219     return nullptr;
220 }
221 
222 //! Cancel parallel_reduce task
223 template<typename Range, typename Body, typename Partitioner>
224 task* start_reduce<Range, Body, Partitioner>::cancel(execution_data& ed) {
225     finalize(ed);
226     return nullptr;
227 }
228 
229 //! Tree node type for parallel_deterministic_reduce.
230 /** @ingroup algorithms */
231 template<typename Body>
232 struct deterministic_reduction_tree_node : public tree_node {
233     Body right_body;
234     Body& left_body;
235 
236     deterministic_reduction_tree_node(node* parent, int ref_count, Body& input_left_body, small_object_allocator& alloc) :
237         tree_node{parent, ref_count, alloc},
238         right_body{input_left_body, detail::split()},
239         left_body(input_left_body)
240     {}
241 
242     void join(task_group_context* context) {
243         if (!context->is_group_execution_cancelled())
244             left_body.join(right_body);
245     }
246 };
247 
248 //! Task type used to split the work of parallel_deterministic_reduce.
249 /** @ingroup algorithms */
250 template<typename Range, typename Body, typename Partitioner>
251 struct start_deterministic_reduce : public task {
252     Range my_range;
253     Body& my_body;
254     node* my_parent;
255 
256     typename Partitioner::task_partition_type my_partition;
257     small_object_allocator my_allocator;
258 
259     task* execute(execution_data&) override;
260     task* cancel(execution_data&) override;
261     void finalize(const execution_data&);
262 
263     using tree_node_type = deterministic_reduction_tree_node<Body>;
264 
265     //! Constructor deterministic_reduce root task.
266     start_deterministic_reduce( const Range& range, Partitioner& partitioner, Body& body, small_object_allocator& alloc ) :
267         my_range(range),
268         my_body(body),
269         my_parent(nullptr),
270         my_partition(partitioner),
271         my_allocator(alloc) {}
272     //! Splitting constructor used to generate children.
273     /** parent_ becomes left child.  Newly constructed object is right child. */
274     start_deterministic_reduce( start_deterministic_reduce& parent_, typename Partitioner::split_type& split_obj, Body& body,
275                                 small_object_allocator& alloc ) :
276         my_range(parent_.my_range, get_range_split_object<Range>(split_obj)),
277         my_body(body),
278         my_parent(nullptr),
279         my_partition(parent_.my_partition, split_obj),
280         my_allocator(alloc) {}
281     static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) {
282         if ( !range.empty() ) {
283             wait_node wn;
284             small_object_allocator alloc{};
285             auto deterministic_reduce_task =
286                 alloc.new_object<start_deterministic_reduce>(range, partitioner, body, alloc);
287             deterministic_reduce_task->my_parent = &wn;
288             execute_and_wait(*deterministic_reduce_task, context, wn.m_wait, context);
289         }
290     }
291     static void run(const Range& range, Body& body, Partitioner& partitioner) {
292         // Bound context prevents exceptions from body to affect nesting or sibling algorithms,
293         // and allows users to handle exceptions safely by wrapping parallel_deterministic_reduce
294         // in the try-block.
295         task_group_context context(PARALLEL_REDUCE);
296         run(range, body, partitioner, context);
297     }
298     //! Run body for range, serves as callback for partitioner
299     void run_body( Range &r ) {
300         tbb::detail::invoke(my_body, r);
301     }
302     //! Spawn right task, serves as callback for partitioner
303     void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) {
304         offer_work_impl(ed, *this, split_obj);
305     }
306 private:
307     template <typename... Args>
308     void offer_work_impl(execution_data& ed, Args&&... args) {
309         small_object_allocator alloc{};
310         // New root node as a continuation and ref count. Left and right child attach to the new parent. Split the body.
311         auto new_tree_node = alloc.new_object<tree_node_type>(ed, my_parent, 2, my_body, alloc);
312 
313         // New right child
314         auto right_child = alloc.new_object<start_deterministic_reduce>(ed, std::forward<Args>(args)..., new_tree_node->right_body, alloc);
315 
316         right_child->my_parent = my_parent = new_tree_node;
317 
318         // Spawn the right sibling
319         right_child->spawn_self(ed);
320     }
321 
322     void spawn_self(execution_data& ed) {
323         my_partition.spawn_task(*this, *context(ed));
324     }
325 };
326 
327 //! Fold the tree and deallocate the task
328 template<typename Range, typename Body, typename Partitioner>
329 void start_deterministic_reduce<Range, Body, Partitioner>::finalize(const execution_data& ed) {
330     // Get the current parent and wait object before an object destruction
331     node* parent = my_parent;
332 
333     auto allocator = my_allocator;
334     // Task execution finished - destroy it
335     this->~start_deterministic_reduce();
336     // Unwind the tree decrementing the parent`s reference count
337     fold_tree<tree_node_type>(parent, ed);
338     allocator.deallocate(this, ed);
339 }
340 
341 //! Execute parallel_deterministic_reduce task
342 template<typename Range, typename Body, typename Partitioner>
343 task* start_deterministic_reduce<Range,Body,Partitioner>::execute(execution_data& ed) {
344     if (!is_same_affinity(ed)) {
345         my_partition.note_affinity(execution_slot(ed));
346     }
347     my_partition.check_being_stolen(*this, ed);
348 
349     my_partition.execute(*this, my_range, ed);
350 
351     finalize(ed);
352     return nullptr;
353 }
354 
355 //! Cancel parallel_deterministic_reduce task
356 template<typename Range, typename Body, typename Partitioner>
357 task* start_deterministic_reduce<Range, Body, Partitioner>::cancel(execution_data& ed) {
358     finalize(ed);
359     return nullptr;
360 }
361 
362 
363 //! Auxiliary class for parallel_reduce; for internal use only.
364 /** The adaptor class that implements \ref parallel_reduce_body_req "parallel_reduce Body"
365     using given \ref parallel_reduce_lambda_req "anonymous function objects".
366  **/
367 /** @ingroup algorithms */
368 template<typename Range, typename Value, typename RealBody, typename Reduction>
369 class lambda_reduce_body {
370 //TODO: decide if my_real_body, my_reduction, and my_identity_element should be copied or referenced
371 //       (might require some performance measurements)
372 
373     const Value&     my_identity_element;
374     const RealBody&  my_real_body;
375     const Reduction& my_reduction;
376     Value            my_value;
377     lambda_reduce_body& operator= ( const lambda_reduce_body& other );
378 public:
379     lambda_reduce_body( const Value& identity, const RealBody& body, const Reduction& reduction )
380         : my_identity_element(identity)
381         , my_real_body(body)
382         , my_reduction(reduction)
383         , my_value(identity)
384     { }
385     lambda_reduce_body( const lambda_reduce_body& other ) = default;
386     lambda_reduce_body( lambda_reduce_body& other, tbb::split )
387         : my_identity_element(other.my_identity_element)
388         , my_real_body(other.my_real_body)
389         , my_reduction(other.my_reduction)
390         , my_value(other.my_identity_element)
391     { }
392     void operator()(Range& range) {
393         my_value = tbb::detail::invoke(my_real_body, range, const_cast<const Value&>(my_value));
394     }
395     void join( lambda_reduce_body& rhs ) {
396         my_value = tbb::detail::invoke(my_reduction, const_cast<const Value&>(my_value),
397                                                      const_cast<const Value&>(rhs.my_value));
398     }
399     Value result() const {
400         return my_value;
401     }
402 };
403 
404 
405 // Requirements on Range concept are documented in blocked_range.h
406 
407 /** \page parallel_reduce_body_req Requirements on parallel_reduce body
408     Class \c Body implementing the concept of parallel_reduce body must define:
409     - \code Body::Body( Body&, split ); \endcode        Splitting constructor.
410                                                         Must be able to run concurrently with operator() and method \c join
411     - \code Body::~Body(); \endcode                     Destructor
412     - \code void Body::operator()( Range& r ); \endcode Function call operator applying body to range \c r
413                                                         and accumulating the result
414     - \code void Body::join( Body& b ); \endcode        Join results.
415                                                         The result in \c b should be merged into the result of \c this
416 **/
417 
418 /** \page parallel_reduce_lambda_req Requirements on parallel_reduce anonymous function objects (lambda functions)
419     TO BE DOCUMENTED
420 **/
421 
422 /** \name parallel_reduce
423     See also requirements on \ref range_req "Range" and \ref parallel_reduce_body_req "parallel_reduce Body". **/
424 //@{
425 
426 //! Parallel iteration with reduction and default partitioner.
427 /** @ingroup algorithms **/
428 template<typename Range, typename Body>
429     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
430 void parallel_reduce( const Range& range, Body& body ) {
431     start_reduce<Range,Body, const __TBB_DEFAULT_PARTITIONER>::run( range, body, __TBB_DEFAULT_PARTITIONER() );
432 }
433 
434 //! Parallel iteration with reduction and simple_partitioner
435 /** @ingroup algorithms **/
436 template<typename Range, typename Body>
437     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
438 void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) {
439     start_reduce<Range,Body,const simple_partitioner>::run( range, body, partitioner );
440 }
441 
442 //! Parallel iteration with reduction and auto_partitioner
443 /** @ingroup algorithms **/
444 template<typename Range, typename Body>
445     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
446 void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner ) {
447     start_reduce<Range,Body,const auto_partitioner>::run( range, body, partitioner );
448 }
449 
450 //! Parallel iteration with reduction and static_partitioner
451 /** @ingroup algorithms **/
452 template<typename Range, typename Body>
453     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
454 void parallel_reduce( const Range& range, Body& body, const static_partitioner& partitioner ) {
455     start_reduce<Range,Body,const static_partitioner>::run( range, body, partitioner );
456 }
457 
458 //! Parallel iteration with reduction and affinity_partitioner
459 /** @ingroup algorithms **/
460 template<typename Range, typename Body>
461     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
462 void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner ) {
463     start_reduce<Range,Body,affinity_partitioner>::run( range, body, partitioner );
464 }
465 
466 //! Parallel iteration with reduction, default partitioner and user-supplied context.
467 /** @ingroup algorithms **/
468 template<typename Range, typename Body>
469     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
470 void parallel_reduce( const Range& range, Body& body, task_group_context& context ) {
471     start_reduce<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run( range, body, __TBB_DEFAULT_PARTITIONER(), context );
472 }
473 
474 //! Parallel iteration with reduction, simple partitioner and user-supplied context.
475 /** @ingroup algorithms **/
476 template<typename Range, typename Body>
477     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
478 void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
479     start_reduce<Range,Body,const simple_partitioner>::run( range, body, partitioner, context );
480 }
481 
482 //! Parallel iteration with reduction, auto_partitioner and user-supplied context
483 /** @ingroup algorithms **/
484 template<typename Range, typename Body>
485     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
486 void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner, task_group_context& context ) {
487     start_reduce<Range,Body,const auto_partitioner>::run( range, body, partitioner, context );
488 }
489 
490 //! Parallel iteration with reduction, static_partitioner and user-supplied context
491 /** @ingroup algorithms **/
492 template<typename Range, typename Body>
493     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
494 void parallel_reduce( const Range& range, Body& body, const static_partitioner& partitioner, task_group_context& context ) {
495     start_reduce<Range,Body,const static_partitioner>::run( range, body, partitioner, context );
496 }
497 
498 //! Parallel iteration with reduction, affinity_partitioner and user-supplied context
499 /** @ingroup algorithms **/
500 template<typename Range, typename Body>
501     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
502 void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner, task_group_context& context ) {
503     start_reduce<Range,Body,affinity_partitioner>::run( range, body, partitioner, context );
504 }
505 /** parallel_reduce overloads that work with anonymous function objects
506     (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/
507 
508 //! Parallel iteration with reduction and default partitioner.
509 /** @ingroup algorithms **/
510 template<typename Range, typename Value, typename RealBody, typename Reduction>
511     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
512                    parallel_reduce_combine<Reduction, Value>)
513 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) {
514     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
515     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
516                           ::run(range, body, __TBB_DEFAULT_PARTITIONER() );
517     return body.result();
518 }
519 
520 //! Parallel iteration with reduction and simple_partitioner.
521 /** @ingroup algorithms **/
522 template<typename Range, typename Value, typename RealBody, typename Reduction>
523     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
524                    parallel_reduce_combine<Reduction, Value>)
525 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
526                        const simple_partitioner& partitioner ) {
527     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
528     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
529                           ::run(range, body, partitioner );
530     return body.result();
531 }
532 
533 //! Parallel iteration with reduction and auto_partitioner
534 /** @ingroup algorithms **/
535 template<typename Range, typename Value, typename RealBody, typename Reduction>
536     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
537                    parallel_reduce_combine<Reduction, Value>)
538 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
539                        const auto_partitioner& partitioner ) {
540     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
541     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
542                           ::run( range, body, partitioner );
543     return body.result();
544 }
545 
546 //! Parallel iteration with reduction and static_partitioner
547 /** @ingroup algorithms **/
548 template<typename Range, typename Value, typename RealBody, typename Reduction>
549     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
550                    parallel_reduce_combine<Reduction, Value>)
551 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
552                        const static_partitioner& partitioner ) {
553     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
554     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
555                                         ::run( range, body, partitioner );
556     return body.result();
557 }
558 
559 //! Parallel iteration with reduction and affinity_partitioner
560 /** @ingroup algorithms **/
561 template<typename Range, typename Value, typename RealBody, typename Reduction>
562     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
563                    parallel_reduce_combine<Reduction, Value>)
564 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
565                        affinity_partitioner& partitioner ) {
566     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
567     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
568                                         ::run( range, body, partitioner );
569     return body.result();
570 }
571 
572 //! Parallel iteration with reduction, default partitioner and user-supplied context.
573 /** @ingroup algorithms **/
574 template<typename Range, typename Value, typename RealBody, typename Reduction>
575     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
576                    parallel_reduce_combine<Reduction, Value>)
577 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
578                        task_group_context& context ) {
579     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
580     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
581                           ::run( range, body, __TBB_DEFAULT_PARTITIONER(), context );
582     return body.result();
583 }
584 
585 //! Parallel iteration with reduction, simple partitioner and user-supplied context.
586 /** @ingroup algorithms **/
587 template<typename Range, typename Value, typename RealBody, typename Reduction>
588     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
589                    parallel_reduce_combine<Reduction, Value>)
590 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
591                        const simple_partitioner& partitioner, task_group_context& context ) {
592     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
593     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
594                           ::run( range, body, partitioner, context );
595     return body.result();
596 }
597 
598 //! Parallel iteration with reduction, auto_partitioner and user-supplied context
599 /** @ingroup algorithms **/
600 template<typename Range, typename Value, typename RealBody, typename Reduction>
601     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
602                    parallel_reduce_combine<Reduction, Value>)
603 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
604                        const auto_partitioner& partitioner, task_group_context& context ) {
605     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
606     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
607                           ::run( range, body, partitioner, context );
608     return body.result();
609 }
610 
611 //! Parallel iteration with reduction, static_partitioner and user-supplied context
612 /** @ingroup algorithms **/
613 template<typename Range, typename Value, typename RealBody, typename Reduction>
614     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
615                    parallel_reduce_combine<Reduction, Value>)
616 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
617                        const static_partitioner& partitioner, task_group_context& context ) {
618     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
619     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
620                                         ::run( range, body, partitioner, context );
621     return body.result();
622 }
623 
624 //! Parallel iteration with reduction, affinity_partitioner and user-supplied context
625 /** @ingroup algorithms **/
626 template<typename Range, typename Value, typename RealBody, typename Reduction>
627     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
628                    parallel_reduce_combine<Reduction, Value>)
629 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
630                        affinity_partitioner& partitioner, task_group_context& context ) {
631     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
632     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
633                                         ::run( range, body, partitioner, context );
634     return body.result();
635 }
636 
637 //! Parallel iteration with deterministic reduction and default simple partitioner.
638 /** @ingroup algorithms **/
639 template<typename Range, typename Body>
640     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
641 void parallel_deterministic_reduce( const Range& range, Body& body ) {
642     start_deterministic_reduce<Range, Body, const simple_partitioner>::run(range, body, simple_partitioner());
643 }
644 
645 //! Parallel iteration with deterministic reduction and simple partitioner.
646 /** @ingroup algorithms **/
647 template<typename Range, typename Body>
648     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
649 void parallel_deterministic_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) {
650     start_deterministic_reduce<Range, Body, const simple_partitioner>::run(range, body, partitioner);
651 }
652 
653 //! Parallel iteration with deterministic reduction and static partitioner.
654 /** @ingroup algorithms **/
655 template<typename Range, typename Body>
656     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
657 void parallel_deterministic_reduce( const Range& range, Body& body, const static_partitioner& partitioner ) {
658     start_deterministic_reduce<Range, Body, const static_partitioner>::run(range, body, partitioner);
659 }
660 
661 //! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context.
662 /** @ingroup algorithms **/
663 template<typename Range, typename Body>
664     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
665 void parallel_deterministic_reduce( const Range& range, Body& body, task_group_context& context ) {
666     start_deterministic_reduce<Range,Body, const simple_partitioner>::run( range, body, simple_partitioner(), context );
667 }
668 
669 //! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context.
670 /** @ingroup algorithms **/
671 template<typename Range, typename Body>
672     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
673 void parallel_deterministic_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
674     start_deterministic_reduce<Range, Body, const simple_partitioner>::run(range, body, partitioner, context);
675 }
676 
677 //! Parallel iteration with deterministic reduction, static partitioner and user-supplied context.
678 /** @ingroup algorithms **/
679 template<typename Range, typename Body>
680     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
681 void parallel_deterministic_reduce( const Range& range, Body& body, const static_partitioner& partitioner, task_group_context& context ) {
682     start_deterministic_reduce<Range, Body, const static_partitioner>::run(range, body, partitioner, context);
683 }
684 
685 /** parallel_reduce overloads that work with anonymous function objects
686     (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/
687 
688 //! Parallel iteration with deterministic reduction and default simple partitioner.
689 // TODO: consider making static_partitioner the default
690 /** @ingroup algorithms **/
691 template<typename Range, typename Value, typename RealBody, typename Reduction>
692     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
693                    parallel_reduce_combine<Reduction, Value>)
694 Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) {
695     return parallel_deterministic_reduce(range, identity, real_body, reduction, simple_partitioner());
696 }
697 
698 //! Parallel iteration with deterministic reduction and simple partitioner.
699 /** @ingroup algorithms **/
700 template<typename Range, typename Value, typename RealBody, typename Reduction>
701     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
702                    parallel_reduce_combine<Reduction, Value>)
703 Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const simple_partitioner& partitioner ) {
704     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
705     start_deterministic_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>, const simple_partitioner>
706                           ::run(range, body, partitioner);
707     return body.result();
708 }
709 
710 //! Parallel iteration with deterministic reduction and static partitioner.
711 /** @ingroup algorithms **/
712 template<typename Range, typename Value, typename RealBody, typename Reduction>
713     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
714                    parallel_reduce_combine<Reduction, Value>)
715 Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const static_partitioner& partitioner ) {
716     lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
717     start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
718         ::run(range, body, partitioner);
719     return body.result();
720 }
721 
722 //! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context.
723 /** @ingroup algorithms **/
724 template<typename Range, typename Value, typename RealBody, typename Reduction>
725     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
726                    parallel_reduce_combine<Reduction, Value>)
727 Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
728     task_group_context& context ) {
729     return parallel_deterministic_reduce(range, identity, real_body, reduction, simple_partitioner(), context);
730 }
731 
732 //! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context.
733 /** @ingroup algorithms **/
734 template<typename Range, typename Value, typename RealBody, typename Reduction>
735     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
736                    parallel_reduce_combine<Reduction, Value>)
737 Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
738     const simple_partitioner& partitioner, task_group_context& context ) {
739     lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
740     start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const simple_partitioner>
741         ::run(range, body, partitioner, context);
742     return body.result();
743 }
744 
745 //! Parallel iteration with deterministic reduction, static partitioner and user-supplied context.
746 /** @ingroup algorithms **/
747 template<typename Range, typename Value, typename RealBody, typename Reduction>
748     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
749                    parallel_reduce_combine<Reduction, Value>)
750 Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
751     const static_partitioner& partitioner, task_group_context& context ) {
752     lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
753     start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
754         ::run(range, body, partitioner, context);
755     return body.result();
756 }
757 //@}
758 
759 } // namespace d1
760 } // namespace detail
761 
762 inline namespace v1 {
763 using detail::d1::parallel_reduce;
764 using detail::d1::parallel_deterministic_reduce;
765 // Split types
766 using detail::split;
767 using detail::proportional_split;
768 } // namespace v1
769 
770 } // namespace tbb
771 #endif /* __TBB_parallel_reduce_H */
772