/*
    Copyright (c) 2005-2023 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_partitioner_H
#define __TBB_partitioner_H

#ifndef __TBB_INITIAL_CHUNKS
// initial number of task divisions per thread
#define __TBB_INITIAL_CHUNKS 2
#endif
#ifndef __TBB_RANGE_POOL_CAPACITY
// maximum number of elements in the range pool
#define __TBB_RANGE_POOL_CAPACITY 8
#endif
#ifndef __TBB_INIT_DEPTH
// initial depth of the range pool
#define __TBB_INIT_DEPTH 5
#endif
#ifndef __TBB_DEMAND_DEPTH_ADD
// when an imbalance is detected, the range is split this many additional times
#define __TBB_DEMAND_DEPTH_ADD 1
#endif

#include "detail/_config.h"
#include "detail/_namespace_injection.h"
#include "detail/_aligned_space.h"
#include "detail/_utils.h"
#include "detail/_template_helpers.h"
#include "detail/_range_common.h"
#include "detail/_task.h"
#include "detail/_small_object_pool.h"

#include "cache_aligned_allocator.h"
#include "task_group.h" // task_group_context
#include "task_arena.h"

#include <algorithm>
#include <atomic>
#include <type_traits>

#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
    // Workaround for overzealous compiler warnings
    #pragma warning (push)
    #pragma warning (disable: 4244)
#endif

namespace tbb {
namespace detail {

namespace d1 {
class auto_partitioner;
class simple_partitioner;
class static_partitioner;
class affinity_partitioner;
class affinity_partition_type;
class affinity_partitioner_base;

inline std::size_t get_initial_auto_partitioner_divisor() {
    const std::size_t factor = 4;
    return factor * static_cast<std::size_t>(max_concurrency());
}
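
// Illustrative arithmetic, not a guarantee: with the factor of 4 above and
// max_concurrency() == 8, the initial divisor is 32. The adaptive_mode constructor
// below divides this back by 4 (and multiplies by the partition's factor), and
// auto_partition_type then multiplies by __TBB_INITIAL_CHUNKS (2), so an
// auto-partitioned loop on 8 threads starts with roughly 16 chunks.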

//! Defines the entry point for the affinity partitioner into the oneTBB run-time library.
class affinity_partitioner_base: no_copy {
    friend class affinity_partitioner;
    friend class affinity_partition_type;
    //! Array that remembers affinities of tree positions to affinity_id.
    /** nullptr if my_size==0. */
    slot_id* my_array;
    //! Number of elements in my_array.
    std::size_t my_size;
    //! Zeros the fields.
    affinity_partitioner_base() : my_array(nullptr), my_size(0) {}
    //! Deallocates my_array.
    ~affinity_partitioner_base() { resize(0); }
    //! Resizes my_array.
    /** Retains values if the resulting size is the same. */
    void resize(unsigned factor) {
        // Check factor to avoid asking for the number of workers while there might be no arena.
        unsigned max_threads_in_arena = static_cast<unsigned>(max_concurrency());
        std::size_t new_size = factor ? factor * max_threads_in_arena : 0;
        if (new_size != my_size) {
            if (my_array) {
                r1::cache_aligned_deallocate(my_array);
                // The following two assignments must be done here for the sake of exception safety.
                my_array = nullptr;
                my_size = 0;
            }
            if (new_size) {
                my_array = static_cast<slot_id*>(r1::cache_aligned_allocate(new_size * sizeof(slot_id)));
                std::fill_n(my_array, new_size, no_slot);
                my_size = new_size;
            }
        }
    }
};

template<typename Range, typename Body, typename Partitioner> struct start_for;
template<typename Range, typename Body, typename Partitioner> struct start_scan;
template<typename Range, typename Body, typename Partitioner> struct start_reduce;
template<typename Range, typename Body, typename Partitioner> struct start_deterministic_reduce;

struct node {
    node* my_parent{};
    std::atomic<int> m_ref_count{};

    node() = default;
    node(node* parent, int ref_count) :
        my_parent{parent}, m_ref_count{ref_count} {
        __TBB_ASSERT(ref_count > 0, "The ref count must be positive");
    }
};

struct wait_node : node {
    wait_node() : node{ nullptr, 1 } {}
    wait_context m_wait{1};
};

//! Join task node that contains a shared flag for stealing feedback
struct tree_node : public node {
    small_object_allocator m_allocator;
    std::atomic<bool> m_child_stolen{false};

    tree_node(node* parent, int ref_count, small_object_allocator& alloc)
        : node{parent, ref_count}
        , m_allocator{alloc} {}

    void join(task_group_context*) {/*dummy, required only for reduction algorithms*/}

    template <typename Task>
    static void mark_task_stolen(Task &t) {
        std::atomic<bool> &flag = static_cast<tree_node*>(t.my_parent)->m_child_stolen;
#if TBB_USE_PROFILING_TOOLS
        // Threading tools respect the lock prefix but report a false-positive data race for a plain store
        flag.exchange(true);
#else
        flag.store(true, std::memory_order_relaxed);
#endif // TBB_USE_PROFILING_TOOLS
    }
    template <typename Task>
    static bool is_peer_stolen(Task &t) {
        return static_cast<tree_node*>(t.my_parent)->m_child_stolen.load(std::memory_order_relaxed);
    }
};

// The context is used to check the cancellation state during the reduction join process
template<typename TreeNodeType>
void fold_tree(node* n, const execution_data& ed) {
    for (;;) {
        __TBB_ASSERT(n, nullptr);
        __TBB_ASSERT(n->m_ref_count.load(std::memory_order_relaxed) > 0, "The refcount must be positive.");
        call_itt_task_notify(releasing, n);
        if (--n->m_ref_count > 0) {
            return;
        }
        node* parent = n->my_parent;
        if (!parent) {
            break;
        }

        call_itt_task_notify(acquired, n);
        TreeNodeType* self = static_cast<TreeNodeType*>(n);
        self->join(ed.context);
        self->m_allocator.delete_object(self, ed);
        n = parent;
    }
    // Finish the parallel execution when the root (last node) is reached
    static_cast<wait_node*>(n)->m_wait.release();
}

//! Depth is the relative depth of recursive division inside a range pool. Relative depth allows
//! infinite absolute depth of the recursion for heavily unbalanced workloads with a range represented
//! by a number that cannot fit into a machine word.
typedef unsigned char depth_t;

//! Range pool stores ranges of type T in a circular buffer with MaxCapacity
template <typename T, depth_t MaxCapacity>
class range_vector {
    depth_t my_head;
    depth_t my_tail;
    depth_t my_size;
    depth_t my_depth[MaxCapacity]; // relative depths of stored ranges
    tbb::detail::aligned_space<T, MaxCapacity> my_pool;

public:
    //! Initializes the pool with its first range
    range_vector(const T& elem) : my_head(0), my_tail(0), my_size(1) {
        my_depth[0] = 0;
        new( static_cast<void *>(my_pool.begin()) ) T(elem);//TODO: std::move?
    }
    ~range_vector() {
        while( !empty() ) pop_back();
    }
    bool empty() const { return my_size == 0; }
    depth_t size() const { return my_size; }
    //! Populates the range pool with ranges up to the maximum depth or while they are divisible
    //! max_depth starts from 0; e.g. a value of 2 yields up to 3 ranges in the pool, with up to two 1/4-size pieces
    void split_to_fill(depth_t max_depth) {
        while( my_size < MaxCapacity && is_divisible(max_depth) ) {
            depth_t prev = my_head;
            my_head = (my_head + 1) % MaxCapacity;
            new(my_pool.begin()+my_head) T(my_pool.begin()[prev]); // copy TODO: std::move?
            my_pool.begin()[prev].~T(); // instead of assignment
            new(my_pool.begin()+prev) T(my_pool.begin()[my_head], detail::split()); // do 'inverse' split
            my_depth[my_head] = ++my_depth[prev];
            my_size++;
        }
    }
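    // Illustrative walk-through of the loop above (a sketch, not a contract): starting
    // from a pool holding one range R at depth 0, split_to_fill(2) first produces two
    // halves at depth 1, then splits the newest half again, leaving
    // [R/2 (depth 1), R/4 (depth 2), R/4 (depth 2)] before the depth limit stops it.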
    void pop_back() {
        __TBB_ASSERT(my_size > 0, "range_vector::pop_back() with empty size");
        my_pool.begin()[my_head].~T();
        my_size--;
        my_head = (my_head + MaxCapacity - 1) % MaxCapacity;
    }
    void pop_front() {
        __TBB_ASSERT(my_size > 0, "range_vector::pop_front() with empty size");
        my_pool.begin()[my_tail].~T();
        my_size--;
        my_tail = (my_tail + 1) % MaxCapacity;
    }
    T& back() {
        __TBB_ASSERT(my_size > 0, "range_vector::back() with empty size");
        return my_pool.begin()[my_head];
    }
    T& front() {
        __TBB_ASSERT(my_size > 0, "range_vector::front() with empty size");
        return my_pool.begin()[my_tail];
    }
    //! Similar to front(): returns the depth of the first range in the pool
    depth_t front_depth() {
        __TBB_ASSERT(my_size > 0, "range_vector::front_depth() with empty size");
        return my_depth[my_tail];
    }
    depth_t back_depth() {
        __TBB_ASSERT(my_size > 0, "range_vector::back_depth() with empty size");
        return my_depth[my_head];
    }
    bool is_divisible(depth_t max_depth) {
        return back_depth() < max_depth && back().is_divisible();
    }
};

//! Provides default methods for partition objects and common algorithm blocks.
template <typename Partition>
struct partition_type_base {
    typedef detail::split split_type;
    // decision makers
    void note_affinity( slot_id ) {}
    template <typename Task>
    bool check_being_stolen(Task&, const execution_data&) { return false; } // part of the old should_execute_range()
    template <typename Range> split_type get_split() { return split(); }
    Partition& self() { return *static_cast<Partition*>(this); } // CRTP helper

    template<typename StartType, typename Range>
    void work_balance(StartType &start, Range &range, const execution_data&) {
        start.run_body( range ); // static partitioner goes here
    }

    template<typename StartType, typename Range>
    void execute(StartType &start, Range &range, execution_data& ed) {
        // The algorithm in a few words ([] denotes calls to the partitioner's decision methods):
        // [If this task is stolen, adjust depth and divisions if necessary, set flag].
        // If the range is divisible {
        //    Spread the work while [initial divisions are left];
        //    Create a trap task [if necessary];
        // }
        // If not divisible or [the max depth is reached], execute; otherwise do the range pool part
        if ( range.is_divisible() ) {
            if ( self().is_divisible() ) {
                do { // split while both the range and the partitioner are divisible
                    typename Partition::split_type split_obj = self().template get_split<Range>();
                    start.offer_work( split_obj, ed );
                } while ( range.is_divisible() && self().is_divisible() );
            }
        }
        self().work_balance(start, range, ed);
    }
};

//! Provides the default splitting strategy for partition objects.
template <typename Partition>
struct adaptive_mode : partition_type_base<Partition> {
    typedef Partition my_partition;
    std::size_t my_divisor;
    // For affinity_partitioner, my_divisor indicates the number of affinity array indices the task reserves.
    // A task which has only one index must produce the right split without a reserved index so that the index
    // is not overwritten in note_affinity() of the created (right) task.
    // I.e. a task created deeper than the affinity array can remember must not save its affinity (LIFO order)
    static const unsigned factor = 1;
    adaptive_mode() : my_divisor(get_initial_auto_partitioner_divisor() / 4 * my_partition::factor) {}
    adaptive_mode(adaptive_mode &src, split) : my_divisor(do_split(src, split())) {}
    adaptive_mode(adaptive_mode&, const proportional_split&) : my_divisor(0)
    {
        // left blank as my_divisor gets overridden in the successors' constructors
    }
    /*! Override the do_split methods in order to specify the splitting strategy */
    std::size_t do_split(adaptive_mode &src, split) {
        return src.my_divisor /= 2u;
    }
};


//! Provides a proportional splitting strategy for partition objects
template <typename Partition>
struct proportional_mode : adaptive_mode<Partition> {
    typedef Partition my_partition;
    using partition_type_base<Partition>::self; // CRTP helper to get access to derived classes

    proportional_mode() : adaptive_mode<Partition>() {}
    proportional_mode(proportional_mode &src, split) : adaptive_mode<Partition>(src, split()) {}
    proportional_mode(proportional_mode &src, const proportional_split& split_obj)
        : adaptive_mode<Partition>(src, split_obj)
    {
        self().my_divisor = do_split(src, split_obj);
    }
    std::size_t do_split(proportional_mode &src, const proportional_split& split_obj) {
        std::size_t portion = split_obj.right() * my_partition::factor;
        portion = (portion + my_partition::factor/2) & (0ul - my_partition::factor);
        src.my_divisor -= portion;
        return portion;
    }
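    // Worked example (illustrative only): with factor == 16 and a parent whose
    // my_divisor == 64, a proportional_split with right() == 1 gives portion == 16;
    // the rounding step keeps it at 16 (already a multiple of factor), the parent
    // keeps 64 - 16 == 48, and the right child receives the returned 16.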
    bool is_divisible() { // part of the old should_execute_range()
        return self().my_divisor > my_partition::factor;
    }
    template <typename Range>
    proportional_split get_split() {
        // Create the proportion from the partitioner's internal resources (threads) that would be used:
        // - it is passed into the proportional_mode constructor to split the partitioner
        // - if the Range supports the proportional_split constructor, it will use the proposed proportion;
        //   otherwise, the tbb::proportional_split object is implicitly (for the Range implementer)
        //   cast to tbb::split

        std::size_t n = self().my_divisor / my_partition::factor;
        std::size_t right = n / 2;
        std::size_t left  = n - right;
        return proportional_split(left, right);
    }
};

static std::size_t get_initial_partition_head() {
    int current_index = tbb::this_task_arena::current_thread_index();
    if (current_index == tbb::task_arena::not_initialized)
        current_index = 0;
    return size_t(current_index);
}

//! Provides default linear indexing of partitioner's sequence
template <typename Partition>
struct linear_affinity_mode : proportional_mode<Partition> {
    std::size_t my_head;
    std::size_t my_max_affinity;
    using proportional_mode<Partition>::self;
    linear_affinity_mode() : proportional_mode<Partition>(), my_head(get_initial_partition_head()),
                             my_max_affinity(self().my_divisor) {}
    linear_affinity_mode(linear_affinity_mode &src, split) : proportional_mode<Partition>(src, split())
        , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {}
    linear_affinity_mode(linear_affinity_mode &src, const proportional_split& split_obj) : proportional_mode<Partition>(src, split_obj)
        , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {}
    void spawn_task(task& t, task_group_context& ctx) {
        if (self().my_divisor) {
            spawn(t, ctx, slot_id(my_head));
        } else {
            spawn(t, ctx);
        }
    }
};

static bool is_stolen_task(const execution_data& ed) {
    return execution_slot(ed) != original_slot(ed);
}

/*! Determines the work-balance phase, implementing splitting and stealing actions */
template<class Mode>
struct dynamic_grainsize_mode : Mode {
    using Mode::self;
    enum {
        begin = 0,
        run,
        pass
    } my_delay;
    depth_t my_max_depth;
    static const unsigned range_pool_size = __TBB_RANGE_POOL_CAPACITY;
    dynamic_grainsize_mode(): Mode()
        , my_delay(begin)
        , my_max_depth(__TBB_INIT_DEPTH) {}
    dynamic_grainsize_mode(dynamic_grainsize_mode& p, split)
        : Mode(p, split())
        , my_delay(pass)
        , my_max_depth(p.my_max_depth) {}
    dynamic_grainsize_mode(dynamic_grainsize_mode& p, const proportional_split& split_obj)
        : Mode(p, split_obj)
        , my_delay(begin)
        , my_max_depth(p.my_max_depth) {}
    template <typename Task>
    bool check_being_stolen(Task &t, const execution_data& ed) { // part of the old should_execute_range()
        if( !(self().my_divisor / Mode::my_partition::factor) ) { // if not from the top P tasks of the binary tree
            self().my_divisor = 1; // TODO: replace by an on-stack flag (partition_state's member)?
            if( is_stolen_task(ed) && t.my_parent->m_ref_count >= 2 ) { // runs concurrently with the left task
#if __TBB_USE_OPTIONAL_RTTI
                // RTTI is available, check whether the cast is valid
                // TODO: TBB_REVAMP_TODO __TBB_ASSERT(dynamic_cast<tree_node*>(t.m_parent), 0);
                // correctness of the cast relies on avoiding the root task, for which:
                // - the initial value of my_divisor != 0 (protected by a separate assertion)
                // - is_stolen_task() always returns false.
#endif
                tree_node::mark_task_stolen(t);
                if( !my_max_depth ) my_max_depth++;
                my_max_depth += __TBB_DEMAND_DEPTH_ADD;
                return true;
            }
        }
        return false;
    }
    depth_t max_depth() { return my_max_depth; }
    void align_depth(depth_t base) {
        __TBB_ASSERT(base <= my_max_depth, nullptr);
        my_max_depth -= base;
    }
    template<typename StartType, typename Range>
    void work_balance(StartType &start, Range &range, execution_data& ed) {
        if( !range.is_divisible() || !self().max_depth() ) {
            start.run_body( range );
        }
        else { // do the range pool
            range_vector<Range, range_pool_size> range_pool(range);
            do {
                range_pool.split_to_fill(self().max_depth()); // fill the range pool
                if( self().check_for_demand( start ) ) {
                    if( range_pool.size() > 1 ) {
                        start.offer_work( range_pool.front(), range_pool.front_depth(), ed );
                        range_pool.pop_front();
                        continue;
                    }
                    if( range_pool.is_divisible(self().max_depth()) ) // there was not enough depth to fork a task
                        continue; // note: the next split_to_fill() should split the range at least once
                }
                start.run_body( range_pool.back() );
                range_pool.pop_back();
            } while( !range_pool.empty() && !ed.context->is_group_execution_cancelled() );
        }
    }
    template <typename Task>
    bool check_for_demand(Task& t) {
        if ( pass == my_delay ) {
            if ( self().my_divisor > 1 ) // produce affinitized tasks while they have a slot in the array
                return true; // do not do my_max_depth++ here, but be sure the range_pool is splittable once more
            else if ( self().my_divisor && my_max_depth ) { // make a balancing task
                self().my_divisor = 0; // once per task; the depth will be decreased in align_depth()
                return true;
            }
            else if ( tree_node::is_peer_stolen(t) ) {
                my_max_depth += __TBB_DEMAND_DEPTH_ADD;
                return true;
            }
        } else if( begin == my_delay ) {
            my_delay = pass;
        }
        return false;
    }
};

class auto_partition_type: public dynamic_grainsize_mode<adaptive_mode<auto_partition_type> > {
public:
    auto_partition_type( const auto_partitioner& ) {
        my_divisor *= __TBB_INITIAL_CHUNKS;
    }
    auto_partition_type( auto_partition_type& src, split)
        : dynamic_grainsize_mode<adaptive_mode<auto_partition_type> >(src, split()) {}
    bool is_divisible() { // part of the old should_execute_range()
        if( my_divisor > 1 ) return true;
        if( my_divisor && my_max_depth ) { // can split the task. TODO: on-stack flag instead
            // keep the same fragmentation while splitting for the local task pool
            my_max_depth--;
            my_divisor = 0; // decrease max_depth once per task
            return true;
        } else return false;
    }
    template <typename Task>
    bool check_for_demand(Task& t) {
        if (tree_node::is_peer_stolen(t)) {
            my_max_depth += __TBB_DEMAND_DEPTH_ADD;
            return true;
        } else return false;
    }
    void spawn_task(task& t, task_group_context& ctx) {
        spawn(t, ctx);
    }
};

class simple_partition_type: public partition_type_base<simple_partition_type> {
public:
    simple_partition_type( const simple_partitioner& ) {}
    simple_partition_type( const simple_partition_type&, split ) {}
    //! simplified algorithm
    template<typename StartType, typename Range>
    void execute(StartType &start, Range &range, execution_data& ed) {
        split_type split_obj = split(); // start.offer_work accepts split_type as reference
        while( range.is_divisible() )
            start.offer_work( split_obj, ed );
        start.run_body( range );
    }
    void spawn_task(task& t, task_group_context& ctx) {
        spawn(t, ctx);
    }
};

class static_partition_type : public linear_affinity_mode<static_partition_type> {
public:
    typedef detail::proportional_split split_type;
    static_partition_type( const static_partitioner& ) {}
    static_partition_type( static_partition_type& p, const proportional_split& split_obj )
        : linear_affinity_mode<static_partition_type>(p, split_obj) {}
};

class affinity_partition_type : public dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> > {
    static const unsigned factor_power = 4; // TODO: get a unified formula based on the number of computing units
    slot_id* my_array;
public:
    static const unsigned factor = 1 << factor_power; // number of slots in the affinity array per task
    typedef detail::proportional_split split_type;
    affinity_partition_type( affinity_partitioner_base& ap ) {
        __TBB_ASSERT( (factor&(factor-1))==0, "factor must be power of two" );
        ap.resize(factor);
        my_array = ap.my_array;
        my_max_depth = factor_power + 1;
        __TBB_ASSERT( my_max_depth < __TBB_RANGE_POOL_CAPACITY, nullptr );
    }
    affinity_partition_type(affinity_partition_type& p, split)
        : dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> >(p, split())
        , my_array(p.my_array) {}
    affinity_partition_type(affinity_partition_type& p, const proportional_split& split_obj)
        : dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> >(p, split_obj)
        , my_array(p.my_array) {}
    void note_affinity(slot_id id) {
        if( my_divisor )
            my_array[my_head] = id;
    }
    void spawn_task(task& t, task_group_context& ctx) {
        if (my_divisor) {
            if (!my_array[my_head]) {
                // TODO: consider new ideas with my_array for both affinity and static partitioners, then code reuse
                spawn(t, ctx, slot_id(my_head / factor));
            } else {
                spawn(t, ctx, my_array[my_head]);
            }
        } else {
            spawn(t, ctx);
        }
    }
};

//! A simple partitioner
/** Divides the range recursively until it is no longer divisible.
    @ingroup algorithms */
class simple_partitioner {
public:
    simple_partitioner() {}
private:
    template<typename Range, typename Body, typename Partitioner> friend struct start_for;
    template<typename Range, typename Body, typename Partitioner> friend struct start_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_scan;
    // the new implementation just extends the existing interface
    typedef simple_partition_type task_partition_type;
    // TODO: consider making split_type public
    typedef simple_partition_type::split_type split_type;

    // for parallel_scan only
    class partition_type {
    public:
        bool should_execute_range(const execution_data& ) {return false;}
        partition_type( const simple_partitioner& ) {}
        partition_type( const partition_type&, split ) {}
    };
};
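
// Illustrative usage (a sketch, not part of this header). With simple_partitioner
// the grainsize of the range alone controls chunk size; each executed chunk holds
// roughly between grainsize/2 and grainsize iterations. process() is a placeholder.
//
//     #include <oneapi/tbb/parallel_for.h>
//     #include <oneapi/tbb/blocked_range.h>
//
//     tbb::parallel_for(
//         tbb::blocked_range<std::size_t>(0, n, /*grainsize=*/1000),
//         [&](const tbb::blocked_range<std::size_t>& r) {
//             for (std::size_t i = r.begin(); i != r.end(); ++i) process(i);
//         },
//         tbb::simple_partitioner{});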

//! An auto partitioner
/** The range is initially divided into several large chunks.
    Chunks are further subdivided into smaller pieces if demand is detected and they are divisible.
    @ingroup algorithms */
class auto_partitioner {
public:
    auto_partitioner() {}

private:
    template<typename Range, typename Body, typename Partitioner> friend struct start_for;
    template<typename Range, typename Body, typename Partitioner> friend struct start_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_scan;
    // the new implementation just extends the existing interface
    typedef auto_partition_type task_partition_type;
    // TODO: consider making split_type public
    typedef auto_partition_type::split_type split_type;

    //! Backward-compatible partition for auto and affinity partition objects.
    class partition_type {
        size_t num_chunks;
        static const size_t VICTIM_CHUNKS = 4;
        public:
        bool should_execute_range(const execution_data& ed) {
            if( num_chunks<VICTIM_CHUNKS && is_stolen_task(ed) )
                num_chunks = VICTIM_CHUNKS;
            return num_chunks==1;
        }
        partition_type( const auto_partitioner& )
            : num_chunks(get_initial_auto_partitioner_divisor()*__TBB_INITIAL_CHUNKS/4) {}
        partition_type( partition_type& pt, split ) {
            num_chunks = pt.num_chunks = (pt.num_chunks+1u) / 2u;
        }
    };
};
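
// Illustrative usage (a sketch). auto_partitioner is also the partitioner used by
// parallel_for and parallel_reduce when no partitioner argument is passed; it
// limits splitting to what is needed for load balancing:
//
//     tbb::parallel_for(
//         tbb::blocked_range<std::size_t>(0, n),
//         [&](const tbb::blocked_range<std::size_t>& r) { /* body */ },
//         tbb::auto_partitioner{});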

//! A static partitioner
class static_partitioner {
public:
    static_partitioner() {}
private:
    template<typename Range, typename Body, typename Partitioner> friend struct start_for;
    template<typename Range, typename Body, typename Partitioner> friend struct start_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_scan;
    // the new implementation just extends the existing interface
    typedef static_partition_type task_partition_type;
    // TODO: consider making split_type public
    typedef static_partition_type::split_type split_type;
};
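
// Illustrative usage (a sketch). static_partitioner distributes the iterations
// across the threads of the arena as evenly as possible up front, without further
// dynamic load balancing, which suits uniform workloads:
//
//     tbb::parallel_for(
//         tbb::blocked_range<std::size_t>(0, n),
//         [&](const tbb::blocked_range<std::size_t>& r) { /* body */ },
//         tbb::static_partitioner{});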

//! An affinity partitioner
class affinity_partitioner : affinity_partitioner_base {
public:
    affinity_partitioner() {}

private:
    template<typename Range, typename Body, typename Partitioner> friend struct start_for;
    template<typename Range, typename Body, typename Partitioner> friend struct start_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_scan;
    // the new implementation just extends the existing interface
    typedef affinity_partition_type task_partition_type;
    // TODO: consider making split_type public
    typedef affinity_partition_type::split_type split_type;
};
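
// Illustrative usage (a sketch). Unlike the other partitioners, an
// affinity_partitioner carries state (the affinity array) and is passed by
// non-const reference, so the same object should be reused across invocations
// that traverse the same data to benefit from cache affinity:
//
//     tbb::affinity_partitioner ap;
//     for (int iter = 0; iter < num_iterations; ++iter) {   // num_iterations is a placeholder
//         tbb::parallel_for(
//             tbb::blocked_range<std::size_t>(0, n),
//             [&](const tbb::blocked_range<std::size_t>& r) { /* body */ },
//             ap);
//     }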

} // namespace d1
} // namespace detail

inline namespace v1 {
// Partitioners
using detail::d1::auto_partitioner;
using detail::d1::simple_partitioner;
using detail::d1::static_partitioner;
using detail::d1::affinity_partitioner;
// Split types
using detail::split;
using detail::proportional_split;
} // namespace v1

} // namespace tbb

#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
    #pragma warning (pop)
#endif // warning 4244 is back

#undef __TBB_INITIAL_CHUNKS
#undef __TBB_RANGE_POOL_CAPACITY
#undef __TBB_INIT_DEPTH

#endif /* __TBB_partitioner_H */