/*
    Copyright (c) 2005-2023 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_parallel_reduce_H
#define __TBB_parallel_reduce_H

#include <new>
#include "detail/_namespace_injection.h"
#include "detail/_task.h"
#include "detail/_aligned_space.h"
#include "detail/_small_object_pool.h"
#include "detail/_range_common.h"

#include "task_group.h" // task_group_context
#include "partitioner.h"
#include "profiling.h"

namespace tbb {
namespace detail {
#if __TBB_CPP20_CONCEPTS_PRESENT
inline namespace d0 {

template <typename Body, typename Range>
concept parallel_reduce_body = splittable<Body> &&
                               requires( Body& body, const Range& range, Body& rhs ) {
                                   body(range);
                                   body.join(rhs);
                               };

template <typename Function, typename Range, typename Value>
concept parallel_reduce_function = requires( const std::remove_reference_t<Function>& func,
                                             const Range& range,
                                             const Value& value ) {
    { func(range, value) } -> std::convertible_to<Value>;
};

template <typename Combine, typename Value>
concept parallel_reduce_combine = requires( const std::remove_reference_t<Combine>& combine,
                                            const Value& lhs, const Value& rhs ) {
    { combine(lhs, rhs) } -> std::convertible_to<Value>;
};

} // namespace d0
#endif // __TBB_CPP20_CONCEPTS_PRESENT
namespace d1 {

//! Tree node type for parallel_reduce.
/** @ingroup algorithms */
//TODO: consider folding tree via bypass execution (instead of manual folding)
// for better cancellation and critical tasks handling (performance measurements required).
template<typename Body>
struct reduction_tree_node : public tree_node {
    tbb::detail::aligned_space<Body> zombie_space;
    Body& left_body;
    bool has_right_zombie{false};

    reduction_tree_node(node* parent, int ref_count, Body& input_left_body, small_object_allocator& alloc) :
        tree_node{parent, ref_count, alloc},
        left_body(input_left_body) /* gcc4.8 bug - braced-initialization doesn't work for class members of reference type */
    {}

    void join(task_group_context* context) {
        if (has_right_zombie && !context->is_group_execution_cancelled())
            left_body.join(*zombie_space.begin());
    }

    ~reduction_tree_node() {
        if( has_right_zombie ) zombie_space.begin()->~Body();
    }
};

//! Task type used to split the work of parallel_reduce.
/** @ingroup algorithms */
template<typename Range, typename Body, typename Partitioner>
struct start_reduce : public task {
    Range my_range;
    Body* my_body;
    node* my_parent;

    typename Partitioner::task_partition_type my_partition;
    small_object_allocator my_allocator;
    bool is_right_child;

    task* execute(execution_data&) override;
    task* cancel(execution_data&) override;
    void finalize(const execution_data&);

    using tree_node_type = reduction_tree_node<Body>;

    //! Constructor for the reduce root task.
    start_reduce( const Range& range, Body& body, Partitioner& partitioner, small_object_allocator& alloc ) :
        my_range(range),
        my_body(&body),
        my_parent(nullptr),
        my_partition(partitioner),
        my_allocator(alloc),
        is_right_child(false) {}
    //! Splitting constructor used to generate children.
    /** parent_ becomes left child. Newly constructed object is right child. */
    start_reduce( start_reduce& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) :
        my_range(parent_.my_range, get_range_split_object<Range>(split_obj)),
        my_body(parent_.my_body),
        my_parent(nullptr),
        my_partition(parent_.my_partition, split_obj),
        my_allocator(alloc),
        is_right_child(true)
    {
        parent_.is_right_child = false;
    }
    //! Construct the right child from the given range in response to demand.
    /** parent_ remains left child. Newly constructed object is right child. */
    start_reduce( start_reduce& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) :
        my_range(r),
        my_body(parent_.my_body),
        my_parent(nullptr),
        my_partition(parent_.my_partition, split()),
        my_allocator(alloc),
        is_right_child(true)
    {
        my_partition.align_depth( d );
        parent_.is_right_child = false;
    }
    static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) {
        if ( !range.empty() ) {
            wait_node wn;
            small_object_allocator alloc{};
            auto reduce_task = alloc.new_object<start_reduce>(range, body, partitioner, alloc);
            reduce_task->my_parent = &wn;
            execute_and_wait(*reduce_task, context, wn.m_wait, context);
        }
    }
    static void run(const Range& range, Body& body, Partitioner& partitioner) {
        // The bound context prevents exceptions thrown from the body from affecting nesting or sibling algorithms,
        // and allows users to handle exceptions safely by wrapping parallel_reduce in a try-block.
        task_group_context context(PARALLEL_REDUCE);
        run(range, body, partitioner, context);
    }
    //! Run body for range; serves as a callback for the partitioner
    void run_body( Range &r ) {
        (*my_body)(r);
    }

    //! Spawn the right task; serves as a callback for the partitioner
    void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) {
        offer_work_impl(ed, *this, split_obj);
    }
    //! Spawn the right task; serves as a callback for the partitioner
    void offer_work(const Range& r, depth_t d, execution_data& ed) {
        offer_work_impl(ed, *this, r, d);
    }

private:
    template <typename... Args>
    void offer_work_impl(execution_data& ed, Args&&... args) {
        small_object_allocator alloc{};
        // New right child
        auto right_child = alloc.new_object<start_reduce>(ed, std::forward<Args>(args)..., alloc);

        // New root node as a continuation and reference count. The left and right children attach to the new parent.
        right_child->my_parent = my_parent = alloc.new_object<tree_node_type>(ed, my_parent, 2, *my_body, alloc);

        // Spawn the right sibling
        right_child->spawn_self(ed);
    }

    void spawn_self(execution_data& ed) {
        my_partition.spawn_task(*this, *context(ed));
    }
};

//! Fold the tree and deallocate the task
template<typename Range, typename Body, typename Partitioner>
void start_reduce<Range, Body, Partitioner>::finalize(const execution_data& ed) {
    // Save the current parent and the allocator before the object is destroyed
    node* parent = my_parent;
    auto allocator = my_allocator;
    // Task execution finished - destroy it
    this->~start_reduce();
    // Unwind the tree, decrementing the parent's reference count
    fold_tree<tree_node_type>(parent, ed);
    allocator.deallocate(this, ed);
}

//! Execute parallel_reduce task
template<typename Range, typename Body, typename Partitioner>
task* start_reduce<Range,Body,Partitioner>::execute(execution_data& ed) {
    if (!is_same_affinity(ed)) {
        my_partition.note_affinity(execution_slot(ed));
    }
    my_partition.check_being_stolen(*this, ed);

    // The acquire barrier synchronizes the data pointed to by my_body if the left
    // task has already finished.
    __TBB_ASSERT(my_parent, nullptr);
    if( is_right_child && my_parent->m_ref_count.load(std::memory_order_acquire) == 2 ) {
        tree_node_type* parent_ptr = static_cast<tree_node_type*>(my_parent);
        my_body = static_cast<Body*>(new( parent_ptr->zombie_space.begin() ) Body(*my_body, split()));
        parent_ptr->has_right_zombie = true;
    }
    __TBB_ASSERT(my_body != nullptr, "Incorrect body value");

    my_partition.execute(*this, my_range, ed);

    finalize(ed);
    return nullptr;
}

//! Cancel parallel_reduce task
template<typename Range, typename Body, typename Partitioner>
task* start_reduce<Range, Body, Partitioner>::cancel(execution_data& ed) {
    finalize(ed);
    return nullptr;
}

//! Tree node type for parallel_deterministic_reduce.
/** @ingroup algorithms */
template<typename Body>
struct deterministic_reduction_tree_node : public tree_node {
    Body right_body;
    Body& left_body;

    deterministic_reduction_tree_node(node* parent, int ref_count, Body& input_left_body, small_object_allocator& alloc) :
        tree_node{parent, ref_count, alloc},
        right_body{input_left_body, detail::split()},
        left_body(input_left_body)
    {}

    void join(task_group_context* context) {
        if (!context->is_group_execution_cancelled())
            left_body.join(right_body);
    }
};

//! Task type used to split the work of parallel_deterministic_reduce.
/** @ingroup algorithms */
template<typename Range, typename Body, typename Partitioner>
struct start_deterministic_reduce : public task {
    Range my_range;
    Body& my_body;
    node* my_parent;

    typename Partitioner::task_partition_type my_partition;
    small_object_allocator my_allocator;

    task* execute(execution_data&) override;
    task* cancel(execution_data&) override;
    void finalize(const execution_data&);

    using tree_node_type = deterministic_reduction_tree_node<Body>;

    //! Constructor for the deterministic_reduce root task.
    start_deterministic_reduce( const Range& range, Partitioner& partitioner, Body& body, small_object_allocator& alloc ) :
        my_range(range),
        my_body(body),
        my_parent(nullptr),
        my_partition(partitioner),
        my_allocator(alloc) {}
    //! Splitting constructor used to generate children.
    /** parent_ becomes left child. Newly constructed object is right child. */
    start_deterministic_reduce( start_deterministic_reduce& parent_, typename Partitioner::split_type& split_obj, Body& body,
                                small_object_allocator& alloc ) :
        my_range(parent_.my_range, get_range_split_object<Range>(split_obj)),
        my_body(body),
        my_parent(nullptr),
        my_partition(parent_.my_partition, split_obj),
        my_allocator(alloc) {}
    static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) {
        if ( !range.empty() ) {
            wait_node wn;
            small_object_allocator alloc{};
            auto deterministic_reduce_task =
                alloc.new_object<start_deterministic_reduce>(range, partitioner, body, alloc);
            deterministic_reduce_task->my_parent = &wn;
            execute_and_wait(*deterministic_reduce_task, context, wn.m_wait, context);
        }
    }
    static void run(const Range& range, Body& body, Partitioner& partitioner) {
        // The bound context prevents exceptions thrown from the body from affecting nesting or sibling algorithms,
        // and allows users to handle exceptions safely by wrapping parallel_deterministic_reduce
        // in a try-block.
        task_group_context context(PARALLEL_REDUCE);
        run(range, body, partitioner, context);
    }
    //! Run body for range; serves as a callback for the partitioner
    void run_body( Range &r ) {
        my_body( r );
    }
    //! Spawn the right task; serves as a callback for the partitioner
    void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) {
        offer_work_impl(ed, *this, split_obj);
    }
private:
    template <typename... Args>
    void offer_work_impl(execution_data& ed, Args&&... args) {
        small_object_allocator alloc{};
        // New root node as a continuation and reference count. The left and right children attach to the new parent. Split the body.
        auto new_tree_node = alloc.new_object<tree_node_type>(ed, my_parent, 2, my_body, alloc);

        // New right child
        auto right_child = alloc.new_object<start_deterministic_reduce>(ed, std::forward<Args>(args)..., new_tree_node->right_body, alloc);

        right_child->my_parent = my_parent = new_tree_node;

        // Spawn the right sibling
        right_child->spawn_self(ed);
    }

    void spawn_self(execution_data& ed) {
        my_partition.spawn_task(*this, *context(ed));
    }
};

//! Fold the tree and deallocate the task
template<typename Range, typename Body, typename Partitioner>
void start_deterministic_reduce<Range, Body, Partitioner>::finalize(const execution_data& ed) {
    // Save the current parent and the allocator before the object is destroyed
    node* parent = my_parent;

    auto allocator = my_allocator;
    // Task execution finished - destroy it
    this->~start_deterministic_reduce();
    // Unwind the tree, decrementing the parent's reference count
    fold_tree<tree_node_type>(parent, ed);
    allocator.deallocate(this, ed);
}

//! Execute parallel_deterministic_reduce task
template<typename Range, typename Body, typename Partitioner>
task* start_deterministic_reduce<Range,Body,Partitioner>::execute(execution_data& ed) {
    if (!is_same_affinity(ed)) {
        my_partition.note_affinity(execution_slot(ed));
    }
    my_partition.check_being_stolen(*this, ed);

    my_partition.execute(*this, my_range, ed);

    finalize(ed);
    return nullptr;
}

//! Cancel parallel_deterministic_reduce task
template<typename Range, typename Body, typename Partitioner>
task* start_deterministic_reduce<Range, Body, Partitioner>::cancel(execution_data& ed) {
    finalize(ed);
    return nullptr;
}


//! Auxiliary class for parallel_reduce; for internal use only.
/** The adaptor class that implements the \ref parallel_reduce_body_req "parallel_reduce Body" requirements
    using the given \ref parallel_reduce_lambda_req "anonymous function objects".
 **/
/** @ingroup algorithms */
template<typename Range, typename Value, typename RealBody, typename Reduction>
class lambda_reduce_body {
//TODO: decide if my_real_body, my_reduction, and my_identity_element should be copied or referenced
//       (might require some performance measurements)

    const Value&     my_identity_element;
    const RealBody&  my_real_body;
    const Reduction& my_reduction;
    Value            my_value;
    lambda_reduce_body& operator= ( const lambda_reduce_body& other );
public:
    lambda_reduce_body( const Value& identity, const RealBody& body, const Reduction& reduction )
        : my_identity_element(identity)
        , my_real_body(body)
        , my_reduction(reduction)
        , my_value(identity)
    { }
    lambda_reduce_body( const lambda_reduce_body& other ) = default;
    lambda_reduce_body( lambda_reduce_body& other, tbb::split )
        : my_identity_element(other.my_identity_element)
        , my_real_body(other.my_real_body)
        , my_reduction(other.my_reduction)
        , my_value(other.my_identity_element)
    { }
    void operator()(Range& range) {
        my_value = my_real_body(range, const_cast<const Value&>(my_value));
    }
    void join( lambda_reduce_body& rhs ) {
        my_value = my_reduction(const_cast<const Value&>(my_value), const_cast<const Value&>(rhs.my_value));
    }
    Value result() const {
        return my_value;
    }
};


// Requirements on Range concept are documented in blocked_range.h

/** \page parallel_reduce_body_req Requirements on parallel_reduce body
    Class \c Body implementing the concept of parallel_reduce body must define:
    - \code Body::Body( Body&, split ); \endcode        Splitting constructor.
                                                        Must be able to run concurrently with operator() and method \c join
    - \code Body::~Body(); \endcode                     Destructor
    - \code void Body::operator()( Range& r ); \endcode Function call operator applying body to range \c r
                                                        and accumulating the result
    - \code void Body::join( Body& b ); \endcode        Join results.
                                                        The result in \c b should be merged into the result of \c this
**/
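
/** \par Example (informal)
    A minimal sketch of a Body satisfying the requirements above, assuming
    "oneapi/tbb/blocked_range.h" is included; the names \c SumBody, \c data, and \c n
    are illustrative only and not part of the library:
    \code
    struct SumBody {
        const float* data;
        float sum;
        explicit SumBody( const float* d ) : data(d), sum(0.f) {}
        SumBody( SumBody& other, tbb::split ) : data(other.data), sum(0.f) {} // splitting constructor
        void operator()( const tbb::blocked_range<std::size_t>& r ) {
            for( std::size_t i = r.begin(); i != r.end(); ++i )
                sum += data[i];                                               // accumulate into this body
        }
        void join( SumBody& rhs ) { sum += rhs.sum; }                         // merge the right body's result
    };

    SumBody body(data);
    tbb::parallel_reduce( tbb::blocked_range<std::size_t>(0, n), body );
    // body.sum now holds the reduction result.
    \endcode
**/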

/** \page parallel_reduce_lambda_req Requirements on parallel_reduce anonymous function objects (lambda functions)
    TO BE DOCUMENTED
**/
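
/** \par Example (informal)
    A sketch of the functional form built from the concepts above: the body functor is callable as
    \c func(range,value) and returns something convertible to \c Value, and the reduction combines two
    values. The names \c data and \c n are illustrative only, and "oneapi/tbb/blocked_range.h" is
    assumed to be included:
    \code
    float total = tbb::parallel_reduce(
        tbb::blocked_range<std::size_t>(0, n),
        0.f,                                                      // identity element
        [&]( const tbb::blocked_range<std::size_t>& r, float running ) {
            for( std::size_t i = r.begin(); i != r.end(); ++i )
                running += data[i];                               // accumulate over the subrange
            return running;
        },
        []( float x, float y ) { return x + y; } );               // combine partial results
    \endcode
**/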

/** \name parallel_reduce
    See also requirements on \ref range_req "Range" and \ref parallel_reduce_body_req "parallel_reduce Body". **/
//@{
423 
424 //! Parallel iteration with reduction and default partitioner.
425 /** @ingroup algorithms **/
426 template<typename Range, typename Body>
427     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
428 void parallel_reduce( const Range& range, Body& body ) {
429     start_reduce<Range,Body, const __TBB_DEFAULT_PARTITIONER>::run( range, body, __TBB_DEFAULT_PARTITIONER() );
430 }
431 
432 //! Parallel iteration with reduction and simple_partitioner
433 /** @ingroup algorithms **/
434 template<typename Range, typename Body>
435     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
436 void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) {
437     start_reduce<Range,Body,const simple_partitioner>::run( range, body, partitioner );
438 }
439 
440 //! Parallel iteration with reduction and auto_partitioner
441 /** @ingroup algorithms **/
442 template<typename Range, typename Body>
443     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
444 void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner ) {
445     start_reduce<Range,Body,const auto_partitioner>::run( range, body, partitioner );
446 }
447 
448 //! Parallel iteration with reduction and static_partitioner
449 /** @ingroup algorithms **/
450 template<typename Range, typename Body>
451     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
452 void parallel_reduce( const Range& range, Body& body, const static_partitioner& partitioner ) {
453     start_reduce<Range,Body,const static_partitioner>::run( range, body, partitioner );
454 }
455 
456 //! Parallel iteration with reduction and affinity_partitioner
457 /** @ingroup algorithms **/
458 template<typename Range, typename Body>
459     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
460 void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner ) {
461     start_reduce<Range,Body,affinity_partitioner>::run( range, body, partitioner );
462 }
463 
464 //! Parallel iteration with reduction, default partitioner and user-supplied context.
465 /** @ingroup algorithms **/
466 template<typename Range, typename Body>
467     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
468 void parallel_reduce( const Range& range, Body& body, task_group_context& context ) {
469     start_reduce<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run( range, body, __TBB_DEFAULT_PARTITIONER(), context );
470 }
471 
472 //! Parallel iteration with reduction, simple partitioner and user-supplied context.
473 /** @ingroup algorithms **/
474 template<typename Range, typename Body>
475     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
476 void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
477     start_reduce<Range,Body,const simple_partitioner>::run( range, body, partitioner, context );
478 }
479 
480 //! Parallel iteration with reduction, auto_partitioner and user-supplied context
481 /** @ingroup algorithms **/
482 template<typename Range, typename Body>
483     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
484 void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner, task_group_context& context ) {
485     start_reduce<Range,Body,const auto_partitioner>::run( range, body, partitioner, context );
486 }
487 
488 //! Parallel iteration with reduction, static_partitioner and user-supplied context
489 /** @ingroup algorithms **/
490 template<typename Range, typename Body>
491     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
492 void parallel_reduce( const Range& range, Body& body, const static_partitioner& partitioner, task_group_context& context ) {
493     start_reduce<Range,Body,const static_partitioner>::run( range, body, partitioner, context );
494 }
495 
496 //! Parallel iteration with reduction, affinity_partitioner and user-supplied context
497 /** @ingroup algorithms **/
498 template<typename Range, typename Body>
499     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
500 void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner, task_group_context& context ) {
501     start_reduce<Range,Body,affinity_partitioner>::run( range, body, partitioner, context );
502 }
503 /** parallel_reduce overloads that work with anonymous function objects
504     (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/

//! Parallel iteration with reduction and default partitioner.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) {
    lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
    start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
                          ::run(range, body, __TBB_DEFAULT_PARTITIONER() );
    return body.result();
}

//! Parallel iteration with reduction and simple_partitioner.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
                       const simple_partitioner& partitioner ) {
    lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
    start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
                          ::run(range, body, partitioner );
    return body.result();
}

//! Parallel iteration with reduction and auto_partitioner.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
                       const auto_partitioner& partitioner ) {
    lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
    start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
                          ::run( range, body, partitioner );
    return body.result();
}

//! Parallel iteration with reduction and static_partitioner.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
                       const static_partitioner& partitioner ) {
    lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
    start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
                                        ::run( range, body, partitioner );
    return body.result();
}

//! Parallel iteration with reduction and affinity_partitioner.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
                       affinity_partitioner& partitioner ) {
    lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
    start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
                                        ::run( range, body, partitioner );
    return body.result();
}

//! Parallel iteration with reduction, default partitioner and user-supplied context.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
                       task_group_context& context ) {
    lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
    start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
                          ::run( range, body, __TBB_DEFAULT_PARTITIONER(), context );
    return body.result();
}

//! Parallel iteration with reduction, simple_partitioner and user-supplied context.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
                       const simple_partitioner& partitioner, task_group_context& context ) {
    lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
    start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
                          ::run( range, body, partitioner, context );
    return body.result();
}

//! Parallel iteration with reduction, auto_partitioner and user-supplied context.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
                       const auto_partitioner& partitioner, task_group_context& context ) {
    lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
    start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
                          ::run( range, body, partitioner, context );
    return body.result();
}

//! Parallel iteration with reduction, static_partitioner and user-supplied context.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
                       const static_partitioner& partitioner, task_group_context& context ) {
    lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
    start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
                                        ::run( range, body, partitioner, context );
    return body.result();
}

//! Parallel iteration with reduction, affinity_partitioner and user-supplied context.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
                       affinity_partitioner& partitioner, task_group_context& context ) {
    lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
    start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
                                        ::run( range, body, partitioner, context );
    return body.result();
}

//! Parallel iteration with deterministic reduction and default simple partitioner.
/** @ingroup algorithms **/
template<typename Range, typename Body>
    __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
void parallel_deterministic_reduce( const Range& range, Body& body ) {
    start_deterministic_reduce<Range, Body, const simple_partitioner>::run(range, body, simple_partitioner());
}

//! Parallel iteration with deterministic reduction and simple partitioner.
/** @ingroup algorithms **/
template<typename Range, typename Body>
    __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
void parallel_deterministic_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) {
    start_deterministic_reduce<Range, Body, const simple_partitioner>::run(range, body, partitioner);
}

//! Parallel iteration with deterministic reduction and static partitioner.
/** @ingroup algorithms **/
template<typename Range, typename Body>
    __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
void parallel_deterministic_reduce( const Range& range, Body& body, const static_partitioner& partitioner ) {
    start_deterministic_reduce<Range, Body, const static_partitioner>::run(range, body, partitioner);
}

//! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context.
/** @ingroup algorithms **/
template<typename Range, typename Body>
    __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
void parallel_deterministic_reduce( const Range& range, Body& body, task_group_context& context ) {
    start_deterministic_reduce<Range,Body, const simple_partitioner>::run( range, body, simple_partitioner(), context );
}

//! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context.
/** @ingroup algorithms **/
template<typename Range, typename Body>
    __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
void parallel_deterministic_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
    start_deterministic_reduce<Range, Body, const simple_partitioner>::run(range, body, partitioner, context);
}

//! Parallel iteration with deterministic reduction, static partitioner and user-supplied context.
/** @ingroup algorithms **/
template<typename Range, typename Body>
    __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
void parallel_deterministic_reduce( const Range& range, Body& body, const static_partitioner& partitioner, task_group_context& context ) {
    start_deterministic_reduce<Range, Body, const static_partitioner>::run(range, body, partitioner, context);
}

/** parallel_deterministic_reduce overloads that work with anonymous function objects
    (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/

//! Parallel iteration with deterministic reduction and default simple partitioner.
// TODO: consider making static_partitioner the default
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) {
    return parallel_deterministic_reduce(range, identity, real_body, reduction, simple_partitioner());
}
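
/** \par Example (informal)
    A sketch of the functional form of \c parallel_deterministic_reduce. With a fixed grain size the
    simple_partitioner produces the same split tree on every run, so partial results are combined in
    the same order and the result is reproducible even for floating-point types. The names \c data
    and \c n are illustrative only, and "oneapi/tbb/blocked_range.h" is assumed to be included:
    \code
    float total = tbb::parallel_deterministic_reduce(
        tbb::blocked_range<std::size_t>(0, n, 4096),              // fixed grain size => fixed split tree
        0.f,
        [&]( const tbb::blocked_range<std::size_t>& r, float running ) {
            for( std::size_t i = r.begin(); i != r.end(); ++i )
                running += data[i];
            return running;
        },
        []( float x, float y ) { return x + y; } );
    \endcode
**/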

//! Parallel iteration with deterministic reduction and simple partitioner.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const simple_partitioner& partitioner ) {
    lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
    start_deterministic_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>, const simple_partitioner>
                          ::run(range, body, partitioner);
    return body.result();
}

//! Parallel iteration with deterministic reduction and static partitioner.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const static_partitioner& partitioner ) {
    lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
    start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
        ::run(range, body, partitioner);
    return body.result();
}

//! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
    task_group_context& context ) {
    return parallel_deterministic_reduce(range, identity, real_body, reduction, simple_partitioner(), context);
}

//! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
    const simple_partitioner& partitioner, task_group_context& context ) {
    lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
    start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const simple_partitioner>
        ::run(range, body, partitioner, context);
    return body.result();
}

//! Parallel iteration with deterministic reduction, static partitioner and user-supplied context.
/** @ingroup algorithms **/
template<typename Range, typename Value, typename RealBody, typename Reduction>
    __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
                   parallel_reduce_combine<Reduction, Value>)
Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
    const static_partitioner& partitioner, task_group_context& context ) {
    lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
    start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
        ::run(range, body, partitioner, context);
    return body.result();
}
//@}

} // namespace d1
} // namespace detail

inline namespace v1 {
using detail::d1::parallel_reduce;
using detail::d1::parallel_deterministic_reduce;
// Split types
using detail::split;
using detail::proportional_split;
} // namespace v1

} // namespace tbb
#endif /* __TBB_parallel_reduce_H */
