xref: /oneTBB/include/oneapi/tbb/parallel_reduce.h (revision 0a2b3987)
1 /*
2     Copyright (c) 2005-2021 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #ifndef __TBB_parallel_reduce_H
18 #define __TBB_parallel_reduce_H
19 
20 #include <new>
21 #include "detail/_namespace_injection.h"
22 #include "detail/_task.h"
23 #include "detail/_aligned_space.h"
24 #include "detail/_small_object_pool.h"
25 #include "detail/_range_common.h"
26 
27 #include "task_group.h" // task_group_context
28 #include "partitioner.h"
29 #include "profiling.h"
30 
31 namespace tbb {
32 namespace detail {
33 #if __TBB_CPP20_CONCEPTS_PRESENT
34 inline namespace d0 {
35 
36 template <typename Body, typename Range>
37 concept parallel_reduce_body = splittable<Body> &&
38                                requires( Body& body, const Range& range, Body& rhs ) {
39                                    body(range);
40                                    body.join(rhs);
41                                };
42 
43 template <typename Function, typename Range, typename Value>
44 concept parallel_reduce_function = requires( const std::remove_reference_t<Function>& func,
45                                              const Range& range,
46                                              const Value& value ) {
47     { func(range, value) } -> std::convertible_to<Value>;
48 };
49 
50 template <typename Combine, typename Value>
51 concept parallel_reduce_combine = requires( const std::remove_reference_t<Combine>& combine,
52                                             const Value& lhs, const Value& rhs ) {
53     { combine(lhs, rhs) } -> std::convertible_to<Value>;
54 };
55 
56 } // namespace d0
57 #endif // __TBB_CPP20_CONCEPTS_PRESENT
58 namespace d1 {
59 
60 //! Tree node type for parallel_reduce.
61 /** @ingroup algorithms */
62 //TODO: consider folding tree via bypass execution(instead of manual folding)
63 // for better cancellation and critical tasks handling (performance measurements required).
64 template<typename Body>
65 struct reduction_tree_node : public tree_node {
66     tbb::detail::aligned_space<Body> zombie_space;
67     Body& left_body;
68     bool has_right_zombie{false};
69 
70     reduction_tree_node(node* parent, int ref_count, Body& input_left_body, small_object_allocator& alloc) :
71         tree_node{parent, ref_count, alloc},
72         left_body(input_left_body) /* gcc4.8 bug - braced-initialization doesn't work for class members of reference type */
73     {}
74 
75     void join(task_group_context* context) {
76         if (has_right_zombie && !context->is_group_execution_cancelled())
77             left_body.join(*zombie_space.begin());
78     }
79 
80     ~reduction_tree_node() {
81         if( has_right_zombie ) zombie_space.begin()->~Body();
82     }
83 };
84 
85 //! Task type used to split the work of parallel_reduce.
86 /** @ingroup algorithms */
87 template<typename Range, typename Body, typename Partitioner>
88 struct start_reduce : public task {
89     Range my_range;
90     Body* my_body;
91     node* my_parent;
92 
93     typename Partitioner::task_partition_type my_partition;
94     small_object_allocator my_allocator;
95     bool is_right_child;
96 
97     task* execute(execution_data&) override;
98     task* cancel(execution_data&) override;
99     void finalize(const execution_data&);
100 
101     using tree_node_type = reduction_tree_node<Body>;
102 
103     //! Constructor reduce root task.
104     start_reduce( const Range& range, Body& body, Partitioner& partitioner, small_object_allocator& alloc ) :
105         my_range(range),
106         my_body(&body),
107         my_partition(partitioner),
108         my_allocator(alloc),
109         is_right_child(false) {}
110     //! Splitting constructor used to generate children.
111     /** parent_ becomes left child. Newly constructed object is right child. */
112     start_reduce( start_reduce& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) :
113         my_range(parent_.my_range, get_range_split_object<Range>(split_obj)),
114         my_body(parent_.my_body),
115         my_partition(parent_.my_partition, split_obj),
116         my_allocator(alloc),
117         is_right_child(true)
118     {
119         parent_.is_right_child = false;
120     }
121     //! Construct right child from the given range as response to the demand.
122     /** parent_ remains left child. Newly constructed object is right child. */
123     start_reduce( start_reduce& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) :
124         my_range(r),
125         my_body(parent_.my_body),
126         my_partition(parent_.my_partition, split()),
127         my_allocator(alloc),
128         is_right_child(true)
129     {
130         my_partition.align_depth( d );
131         parent_.is_right_child = false;
132     }
133     static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) {
134         if ( !range.empty() ) {
135             wait_node wn;
136             small_object_allocator alloc{};
137             auto reduce_task = alloc.new_object<start_reduce>(range, body, partitioner, alloc);
138             reduce_task->my_parent = &wn;
139             execute_and_wait(*reduce_task, context, wn.m_wait, context);
140         }
141     }
142     static void run(const Range& range, Body& body, Partitioner& partitioner) {
143         // Bound context prevents exceptions from body to affect nesting or sibling algorithms,
144         // and allows users to handle exceptions safely by wrapping parallel_reduce in the try-block.
145         task_group_context context(PARALLEL_REDUCE);
146         run(range, body, partitioner, context);
147     }
148     //! Run body for range, serves as callback for partitioner
149     void run_body( Range &r ) {
150         (*my_body)(r);
151     }
152 
153     //! spawn right task, serves as callback for partitioner
154     void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) {
155         offer_work_impl(ed, *this, split_obj);
156     }
157     //! spawn right task, serves as callback for partitioner
158     void offer_work(const Range& r, depth_t d, execution_data& ed) {
159         offer_work_impl(ed, *this, r, d);
160     }
161 
162 private:
163     template <typename... Args>
164     void offer_work_impl(execution_data& ed, Args&&... args) {
165         small_object_allocator alloc{};
166         // New right child
167         auto right_child = alloc.new_object<start_reduce>(ed, std::forward<Args>(args)..., alloc);
168 
169         // New root node as a continuation and ref count. Left and right child attach to the new parent.
170         right_child->my_parent = my_parent = alloc.new_object<tree_node_type>(ed, my_parent, 2, *my_body, alloc);
171 
172         // Spawn the right sibling
173         right_child->spawn_self(ed);
174     }
175 
176     void spawn_self(execution_data& ed) {
177         my_partition.spawn_task(*this, *context(ed));
178     }
179 };
180 
181 //! fold the tree and deallocate the task
182 template<typename Range, typename Body, typename Partitioner>
183 void start_reduce<Range, Body, Partitioner>::finalize(const execution_data& ed) {
184     // Get the current parent and wait object before an object destruction
185     node* parent = my_parent;
186     auto allocator = my_allocator;
187     // Task execution finished - destroy it
188     this->~start_reduce();
189     // Unwind the tree decrementing the parent`s reference count
190     fold_tree<tree_node_type>(parent, ed);
191     allocator.deallocate(this, ed);
192 }
193 
194 //! Execute parallel_reduce task
195 template<typename Range, typename Body, typename Partitioner>
196 task* start_reduce<Range,Body,Partitioner>::execute(execution_data& ed) {
197     if (!is_same_affinity(ed)) {
198         my_partition.note_affinity(execution_slot(ed));
199     }
200     my_partition.check_being_stolen(*this, ed);
201 
202     // The acquire barrier synchronizes the data pointed with my_body if the left
203     // task has already finished.
204     if( is_right_child && my_parent->m_ref_count.load(std::memory_order_acquire) == 2 ) {
205         tree_node_type* parent_ptr = static_cast<tree_node_type*>(my_parent);
206         my_body = (Body*) new( parent_ptr->zombie_space.begin() ) Body(*my_body, split());
207         parent_ptr->has_right_zombie = true;
208     }
209     __TBB_ASSERT(my_body != nullptr, "Incorrect body value");
210 
211     my_partition.execute(*this, my_range, ed);
212 
213     finalize(ed);
214     return nullptr;
215 }
216 
217 //! Cancel parallel_reduce task
218 template<typename Range, typename Body, typename Partitioner>
219 task* start_reduce<Range, Body, Partitioner>::cancel(execution_data& ed) {
220     finalize(ed);
221     return nullptr;
222 }
223 
224 //! Tree node type for parallel_deterministic_reduce.
225 /** @ingroup algorithms */
226 template<typename Body>
227 struct deterministic_reduction_tree_node : public tree_node {
228     Body right_body;
229     Body& left_body;
230 
231     deterministic_reduction_tree_node(node* parent, int ref_count, Body& input_left_body, small_object_allocator& alloc) :
232         tree_node{parent, ref_count, alloc},
233         right_body{input_left_body, detail::split()},
234         left_body(input_left_body)
235     {}
236 
237     void join(task_group_context* context) {
238         if (!context->is_group_execution_cancelled())
239             left_body.join(right_body);
240     }
241 };
242 
243 //! Task type used to split the work of parallel_deterministic_reduce.
244 /** @ingroup algorithms */
245 template<typename Range, typename Body, typename Partitioner>
246 struct start_deterministic_reduce : public task {
247     Range my_range;
248     Body& my_body;
249     node* my_parent;
250 
251     typename Partitioner::task_partition_type my_partition;
252     small_object_allocator my_allocator;
253 
254     task* execute(execution_data&) override;
255     task* cancel(execution_data&) override;
256     void finalize(const execution_data&);
257 
258     using tree_node_type = deterministic_reduction_tree_node<Body>;
259 
260     //! Constructor deterministic_reduce root task.
261     start_deterministic_reduce( const Range& range, Partitioner& partitioner, Body& body, small_object_allocator& alloc ) :
262         my_range(range),
263         my_body(body),
264         my_partition(partitioner),
265         my_allocator(alloc) {}
266     //! Splitting constructor used to generate children.
267     /** parent_ becomes left child.  Newly constructed object is right child. */
268     start_deterministic_reduce( start_deterministic_reduce& parent_, typename Partitioner::split_type& split_obj, Body& body,
269                                 small_object_allocator& alloc ) :
270         my_range(parent_.my_range, get_range_split_object<Range>(split_obj)),
271         my_body(body),
272         my_partition(parent_.my_partition, split_obj),
273         my_allocator(alloc) {}
274     static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) {
275         if ( !range.empty() ) {
276             wait_node wn;
277             small_object_allocator alloc{};
278             auto deterministic_reduce_task =
279                 alloc.new_object<start_deterministic_reduce>(range, partitioner, body, alloc);
280             deterministic_reduce_task->my_parent = &wn;
281             execute_and_wait(*deterministic_reduce_task, context, wn.m_wait, context);
282         }
283     }
284     static void run(const Range& range, Body& body, Partitioner& partitioner) {
285         // Bound context prevents exceptions from body to affect nesting or sibling algorithms,
286         // and allows users to handle exceptions safely by wrapping parallel_deterministic_reduce
287         // in the try-block.
288         task_group_context context(PARALLEL_REDUCE);
289         run(range, body, partitioner, context);
290     }
291     //! Run body for range, serves as callback for partitioner
292     void run_body( Range &r ) {
293         my_body( r );
294     }
295     //! Spawn right task, serves as callback for partitioner
296     void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) {
297         offer_work_impl(ed, *this, split_obj);
298     }
299 private:
300     template <typename... Args>
301     void offer_work_impl(execution_data& ed, Args&&... args) {
302         small_object_allocator alloc{};
303         // New root node as a continuation and ref count. Left and right child attach to the new parent. Split the body.
304         auto new_tree_node = alloc.new_object<tree_node_type>(ed, my_parent, 2, my_body, alloc);
305 
306         // New right child
307         auto right_child = alloc.new_object<start_deterministic_reduce>(ed, std::forward<Args>(args)..., new_tree_node->right_body, alloc);
308 
309         right_child->my_parent = my_parent = new_tree_node;
310 
311         // Spawn the right sibling
312         right_child->spawn_self(ed);
313     }
314 
315     void spawn_self(execution_data& ed) {
316         my_partition.spawn_task(*this, *context(ed));
317     }
318 };
319 
320 //! Fold the tree and deallocate the task
321 template<typename Range, typename Body, typename Partitioner>
322 void start_deterministic_reduce<Range, Body, Partitioner>::finalize(const execution_data& ed) {
323     // Get the current parent and wait object before an object destruction
324     node* parent = my_parent;
325 
326     auto allocator = my_allocator;
327     // Task execution finished - destroy it
328     this->~start_deterministic_reduce();
329     // Unwind the tree decrementing the parent`s reference count
330     fold_tree<tree_node_type>(parent, ed);
331     allocator.deallocate(this, ed);
332 }
333 
334 //! Execute parallel_deterministic_reduce task
335 template<typename Range, typename Body, typename Partitioner>
336 task* start_deterministic_reduce<Range,Body,Partitioner>::execute(execution_data& ed) {
337     if (!is_same_affinity(ed)) {
338         my_partition.note_affinity(execution_slot(ed));
339     }
340     my_partition.check_being_stolen(*this, ed);
341 
342     my_partition.execute(*this, my_range, ed);
343 
344     finalize(ed);
345     return NULL;
346 }
347 
348 //! Cancel parallel_deterministic_reduce task
349 template<typename Range, typename Body, typename Partitioner>
350 task* start_deterministic_reduce<Range, Body, Partitioner>::cancel(execution_data& ed) {
351     finalize(ed);
352     return NULL;
353 }
354 
355 
356 //! Auxiliary class for parallel_reduce; for internal use only.
357 /** The adaptor class that implements \ref parallel_reduce_body_req "parallel_reduce Body"
358     using given \ref parallel_reduce_lambda_req "anonymous function objects".
359  **/
360 /** @ingroup algorithms */
361 template<typename Range, typename Value, typename RealBody, typename Reduction>
362 class lambda_reduce_body {
363 //TODO: decide if my_real_body, my_reduction, and my_identity_element should be copied or referenced
364 //       (might require some performance measurements)
365 
366     const Value&     my_identity_element;
367     const RealBody&  my_real_body;
368     const Reduction& my_reduction;
369     Value            my_value;
370     lambda_reduce_body& operator= ( const lambda_reduce_body& other );
371 public:
372     lambda_reduce_body( const Value& identity, const RealBody& body, const Reduction& reduction )
373         : my_identity_element(identity)
374         , my_real_body(body)
375         , my_reduction(reduction)
376         , my_value(identity)
377     { }
378     lambda_reduce_body( const lambda_reduce_body& other ) = default;
379     lambda_reduce_body( lambda_reduce_body& other, tbb::split )
380         : my_identity_element(other.my_identity_element)
381         , my_real_body(other.my_real_body)
382         , my_reduction(other.my_reduction)
383         , my_value(other.my_identity_element)
384     { }
385     void operator()(Range& range) {
386         my_value = my_real_body(range, const_cast<const Value&>(my_value));
387     }
388     void join( lambda_reduce_body& rhs ) {
389         my_value = my_reduction(const_cast<const Value&>(my_value), const_cast<const Value&>(rhs.my_value));
390     }
391     Value result() const {
392         return my_value;
393     }
394 };
395 
396 
397 // Requirements on Range concept are documented in blocked_range.h
398 
399 /** \page parallel_reduce_body_req Requirements on parallel_reduce body
400     Class \c Body implementing the concept of parallel_reduce body must define:
401     - \code Body::Body( Body&, split ); \endcode        Splitting constructor.
402                                                         Must be able to run concurrently with operator() and method \c join
403     - \code Body::~Body(); \endcode                     Destructor
404     - \code void Body::operator()( Range& r ); \endcode Function call operator applying body to range \c r
405                                                         and accumulating the result
406     - \code void Body::join( Body& b ); \endcode        Join results.
407                                                         The result in \c b should be merged into the result of \c this
408 **/
409 
410 /** \page parallel_reduce_lambda_req Requirements on parallel_reduce anonymous function objects (lambda functions)
411     TO BE DOCUMENTED
412 **/
413 
414 /** \name parallel_reduce
415     See also requirements on \ref range_req "Range" and \ref parallel_reduce_body_req "parallel_reduce Body". **/
416 //@{
417 
418 //! Parallel iteration with reduction and default partitioner.
419 /** @ingroup algorithms **/
420 template<typename Range, typename Body>
421     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
422 void parallel_reduce( const Range& range, Body& body ) {
423     start_reduce<Range,Body, const __TBB_DEFAULT_PARTITIONER>::run( range, body, __TBB_DEFAULT_PARTITIONER() );
424 }
425 
426 //! Parallel iteration with reduction and simple_partitioner
427 /** @ingroup algorithms **/
428 template<typename Range, typename Body>
429     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
430 void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) {
431     start_reduce<Range,Body,const simple_partitioner>::run( range, body, partitioner );
432 }
433 
434 //! Parallel iteration with reduction and auto_partitioner
435 /** @ingroup algorithms **/
436 template<typename Range, typename Body>
437     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
438 void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner ) {
439     start_reduce<Range,Body,const auto_partitioner>::run( range, body, partitioner );
440 }
441 
442 //! Parallel iteration with reduction and static_partitioner
443 /** @ingroup algorithms **/
444 template<typename Range, typename Body>
445     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
446 void parallel_reduce( const Range& range, Body& body, const static_partitioner& partitioner ) {
447     start_reduce<Range,Body,const static_partitioner>::run( range, body, partitioner );
448 }
449 
450 //! Parallel iteration with reduction and affinity_partitioner
451 /** @ingroup algorithms **/
452 template<typename Range, typename Body>
453     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
454 void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner ) {
455     start_reduce<Range,Body,affinity_partitioner>::run( range, body, partitioner );
456 }
457 
458 //! Parallel iteration with reduction, default partitioner and user-supplied context.
459 /** @ingroup algorithms **/
460 template<typename Range, typename Body>
461     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
462 void parallel_reduce( const Range& range, Body& body, task_group_context& context ) {
463     start_reduce<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run( range, body, __TBB_DEFAULT_PARTITIONER(), context );
464 }
465 
466 //! Parallel iteration with reduction, simple partitioner and user-supplied context.
467 /** @ingroup algorithms **/
468 template<typename Range, typename Body>
469     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
470 void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
471     start_reduce<Range,Body,const simple_partitioner>::run( range, body, partitioner, context );
472 }
473 
474 //! Parallel iteration with reduction, auto_partitioner and user-supplied context
475 /** @ingroup algorithms **/
476 template<typename Range, typename Body>
477     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
478 void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner, task_group_context& context ) {
479     start_reduce<Range,Body,const auto_partitioner>::run( range, body, partitioner, context );
480 }
481 
482 //! Parallel iteration with reduction, static_partitioner and user-supplied context
483 /** @ingroup algorithms **/
484 template<typename Range, typename Body>
485     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
486 void parallel_reduce( const Range& range, Body& body, const static_partitioner& partitioner, task_group_context& context ) {
487     start_reduce<Range,Body,const static_partitioner>::run( range, body, partitioner, context );
488 }
489 
490 //! Parallel iteration with reduction, affinity_partitioner and user-supplied context
491 /** @ingroup algorithms **/
492 template<typename Range, typename Body>
493     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
494 void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner, task_group_context& context ) {
495     start_reduce<Range,Body,affinity_partitioner>::run( range, body, partitioner, context );
496 }
497 /** parallel_reduce overloads that work with anonymous function objects
498     (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/
499 
500 //! Parallel iteration with reduction and default partitioner.
501 /** @ingroup algorithms **/
502 template<typename Range, typename Value, typename RealBody, typename Reduction>
503     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
504                    parallel_reduce_combine<Reduction, Value>)
505 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) {
506     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
507     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
508                           ::run(range, body, __TBB_DEFAULT_PARTITIONER() );
509     return body.result();
510 }
511 
512 //! Parallel iteration with reduction and simple_partitioner.
513 /** @ingroup algorithms **/
514 template<typename Range, typename Value, typename RealBody, typename Reduction>
515     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
516                    parallel_reduce_combine<Reduction, Value>)
517 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
518                        const simple_partitioner& partitioner ) {
519     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
520     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
521                           ::run(range, body, partitioner );
522     return body.result();
523 }
524 
525 //! Parallel iteration with reduction and auto_partitioner
526 /** @ingroup algorithms **/
527 template<typename Range, typename Value, typename RealBody, typename Reduction>
528     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
529                    parallel_reduce_combine<Reduction, Value>)
530 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
531                        const auto_partitioner& partitioner ) {
532     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
533     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
534                           ::run( range, body, partitioner );
535     return body.result();
536 }
537 
538 //! Parallel iteration with reduction and static_partitioner
539 /** @ingroup algorithms **/
540 template<typename Range, typename Value, typename RealBody, typename Reduction>
541     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
542                    parallel_reduce_combine<Reduction, Value>)
543 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
544                        const static_partitioner& partitioner ) {
545     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
546     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
547                                         ::run( range, body, partitioner );
548     return body.result();
549 }
550 
551 //! Parallel iteration with reduction and affinity_partitioner
552 /** @ingroup algorithms **/
553 template<typename Range, typename Value, typename RealBody, typename Reduction>
554     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
555                    parallel_reduce_combine<Reduction, Value>)
556 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
557                        affinity_partitioner& partitioner ) {
558     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
559     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
560                                         ::run( range, body, partitioner );
561     return body.result();
562 }
563 
564 //! Parallel iteration with reduction, default partitioner and user-supplied context.
565 /** @ingroup algorithms **/
566 template<typename Range, typename Value, typename RealBody, typename Reduction>
567     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
568                    parallel_reduce_combine<Reduction, Value>)
569 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
570                        task_group_context& context ) {
571     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
572     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
573                           ::run( range, body, __TBB_DEFAULT_PARTITIONER(), context );
574     return body.result();
575 }
576 
577 //! Parallel iteration with reduction, simple partitioner and user-supplied context.
578 /** @ingroup algorithms **/
579 template<typename Range, typename Value, typename RealBody, typename Reduction>
580     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
581                    parallel_reduce_combine<Reduction, Value>)
582 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
583                        const simple_partitioner& partitioner, task_group_context& context ) {
584     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
585     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
586                           ::run( range, body, partitioner, context );
587     return body.result();
588 }
589 
590 //! Parallel iteration with reduction, auto_partitioner and user-supplied context
591 /** @ingroup algorithms **/
592 template<typename Range, typename Value, typename RealBody, typename Reduction>
593     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
594                    parallel_reduce_combine<Reduction, Value>)
595 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
596                        const auto_partitioner& partitioner, task_group_context& context ) {
597     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
598     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
599                           ::run( range, body, partitioner, context );
600     return body.result();
601 }
602 
603 //! Parallel iteration with reduction, static_partitioner and user-supplied context
604 /** @ingroup algorithms **/
605 template<typename Range, typename Value, typename RealBody, typename Reduction>
606     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
607                    parallel_reduce_combine<Reduction, Value>)
608 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
609                        const static_partitioner& partitioner, task_group_context& context ) {
610     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
611     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
612                                         ::run( range, body, partitioner, context );
613     return body.result();
614 }
615 
616 //! Parallel iteration with reduction, affinity_partitioner and user-supplied context
617 /** @ingroup algorithms **/
618 template<typename Range, typename Value, typename RealBody, typename Reduction>
619     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
620                    parallel_reduce_combine<Reduction, Value>)
621 Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
622                        affinity_partitioner& partitioner, task_group_context& context ) {
623     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
624     start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
625                                         ::run( range, body, partitioner, context );
626     return body.result();
627 }
628 
629 //! Parallel iteration with deterministic reduction and default simple partitioner.
630 /** @ingroup algorithms **/
631 template<typename Range, typename Body>
632     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
633 void parallel_deterministic_reduce( const Range& range, Body& body ) {
634     start_deterministic_reduce<Range, Body, const simple_partitioner>::run(range, body, simple_partitioner());
635 }
636 
637 //! Parallel iteration with deterministic reduction and simple partitioner.
638 /** @ingroup algorithms **/
639 template<typename Range, typename Body>
640     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
641 void parallel_deterministic_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) {
642     start_deterministic_reduce<Range, Body, const simple_partitioner>::run(range, body, partitioner);
643 }
644 
645 //! Parallel iteration with deterministic reduction and static partitioner.
646 /** @ingroup algorithms **/
647 template<typename Range, typename Body>
648     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
649 void parallel_deterministic_reduce( const Range& range, Body& body, const static_partitioner& partitioner ) {
650     start_deterministic_reduce<Range, Body, const static_partitioner>::run(range, body, partitioner);
651 }
652 
653 //! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context.
654 /** @ingroup algorithms **/
655 template<typename Range, typename Body>
656     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
657 void parallel_deterministic_reduce( const Range& range, Body& body, task_group_context& context ) {
658     start_deterministic_reduce<Range,Body, const simple_partitioner>::run( range, body, simple_partitioner(), context );
659 }
660 
661 //! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context.
662 /** @ingroup algorithms **/
663 template<typename Range, typename Body>
664     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
665 void parallel_deterministic_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
666     start_deterministic_reduce<Range, Body, const simple_partitioner>::run(range, body, partitioner, context);
667 }
668 
669 //! Parallel iteration with deterministic reduction, static partitioner and user-supplied context.
670 /** @ingroup algorithms **/
671 template<typename Range, typename Body>
672     __TBB_requires(tbb_range<Range> && parallel_reduce_body<Body, Range>)
673 void parallel_deterministic_reduce( const Range& range, Body& body, const static_partitioner& partitioner, task_group_context& context ) {
674     start_deterministic_reduce<Range, Body, const static_partitioner>::run(range, body, partitioner, context);
675 }
676 
677 /** parallel_reduce overloads that work with anonymous function objects
678     (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/
679 
680 //! Parallel iteration with deterministic reduction and default simple partitioner.
681 // TODO: consider making static_partitioner the default
682 /** @ingroup algorithms **/
683 template<typename Range, typename Value, typename RealBody, typename Reduction>
684     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
685                    parallel_reduce_combine<Reduction, Value>)
686 Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) {
687     return parallel_deterministic_reduce(range, identity, real_body, reduction, simple_partitioner());
688 }
689 
690 //! Parallel iteration with deterministic reduction and simple partitioner.
691 /** @ingroup algorithms **/
692 template<typename Range, typename Value, typename RealBody, typename Reduction>
693     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
694                    parallel_reduce_combine<Reduction, Value>)
695 Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const simple_partitioner& partitioner ) {
696     lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
697     start_deterministic_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>, const simple_partitioner>
698                           ::run(range, body, partitioner);
699     return body.result();
700 }
701 
702 //! Parallel iteration with deterministic reduction and static partitioner.
703 /** @ingroup algorithms **/
704 template<typename Range, typename Value, typename RealBody, typename Reduction>
705     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
706                    parallel_reduce_combine<Reduction, Value>)
707 Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const static_partitioner& partitioner ) {
708     lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
709     start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
710         ::run(range, body, partitioner);
711     return body.result();
712 }
713 
714 //! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context.
715 /** @ingroup algorithms **/
716 template<typename Range, typename Value, typename RealBody, typename Reduction>
717     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
718                    parallel_reduce_combine<Reduction, Value>)
719 Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
720     task_group_context& context ) {
721     return parallel_deterministic_reduce(range, identity, real_body, reduction, simple_partitioner(), context);
722 }
723 
724 //! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context.
725 /** @ingroup algorithms **/
726 template<typename Range, typename Value, typename RealBody, typename Reduction>
727     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
728                    parallel_reduce_combine<Reduction, Value>)
729 Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
730     const simple_partitioner& partitioner, task_group_context& context ) {
731     lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
732     start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const simple_partitioner>
733         ::run(range, body, partitioner, context);
734     return body.result();
735 }
736 
737 //! Parallel iteration with deterministic reduction, static partitioner and user-supplied context.
738 /** @ingroup algorithms **/
739 template<typename Range, typename Value, typename RealBody, typename Reduction>
740     __TBB_requires(tbb_range<Range> && parallel_reduce_function<RealBody, Range, Value> &&
741                    parallel_reduce_combine<Reduction, Value>)
742 Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction,
743     const static_partitioner& partitioner, task_group_context& context ) {
744     lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
745     start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
746         ::run(range, body, partitioner, context);
747     return body.result();
748 }
749 //@}
750 
751 } // namespace d1
752 } // namespace detail
753 
754 inline namespace v1 {
755 using detail::d1::parallel_reduce;
756 using detail::d1::parallel_deterministic_reduce;
757 // Split types
758 using detail::split;
759 using detail::proportional_split;
760 } // namespace v1
761 
762 } // namespace tbb
763 #endif /* __TBB_parallel_reduce_H */
764