/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include "oneapi/tbb/detail/_config.h"
#include "oneapi/tbb/tbb_allocator.h"
#include "oneapi/tbb/task_group.h"
#include "governor.h"
#include "thread_data.h"
#include "scheduler_common.h"
#include "itt_notify.h"
#include "task_dispatcher.h"

#include <type_traits>

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// tbb_exception_ptr
//------------------------------------------------------------------------
tbb_exception_ptr* tbb_exception_ptr::allocate() noexcept {
    tbb_exception_ptr* eptr = (tbb_exception_ptr*)allocate_memory(sizeof(tbb_exception_ptr));
    return eptr ? new (eptr) tbb_exception_ptr(std::current_exception()) : nullptr;
}

void tbb_exception_ptr::destroy() noexcept {
    this->~tbb_exception_ptr();
    deallocate_memory(this);
}

void tbb_exception_ptr::throw_self() {
    if (governor::rethrow_exception_broken()) fix_broken_rethrow();
    std::rethrow_exception(my_ptr);
}
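/*
    Illustrative sketch (added commentary, not library code): the intended usage
    pattern for tbb_exception_ptr as defined above. Only allocate(), throw_self()
    and destroy() come from this file; the hypothetical `captured` variable and the
    surrounding try/catch structure are shown purely to outline how an exception is
    snapshotted on one thread and rethrown on another.

        tbb_exception_ptr* captured = nullptr;
        try {
            // ... run user code that may throw ...
        } catch (...) {
            captured = tbb_exception_ptr::allocate();   // snapshots std::current_exception()
        }
        // Later, typically on the thread that waits for the task group:
        if (captured) {
            tbb_exception_ptr* e = captured;
            captured = nullptr;
            try { e->throw_self(); }                    // rethrow on the waiting thread
            catch (...) { e->destroy(); throw; }        // release the holder, keep unwinding
        }
*/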

//------------------------------------------------------------------------
// task_group_context
//------------------------------------------------------------------------

void task_group_context_impl::destroy(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), nullptr);

    auto ctx_lifetime_state = ctx.my_lifetime_state.load(std::memory_order_relaxed);
    __TBB_ASSERT(ctx_lifetime_state != d1::task_group_context::lifetime_state::locked, nullptr);

    if (ctx_lifetime_state == d1::task_group_context::lifetime_state::bound) {
        // The owner can be destroyed at any moment. Access the associated data with caution.
        thread_data* owner = ctx.my_owner.load(std::memory_order_relaxed);
        if (governor::is_thread_data_set(owner)) {
            thread_data::context_list_state& cls = owner->my_context_list_state;
            // We are the owner, so cls is valid.
            // Local update of the context list
            std::uintptr_t local_count_snapshot = cls.epoch.load(std::memory_order_acquire);
            // The sequentially consistent store prevents the load of the nonlocal update flag
            // from being hoisted above the store to the local update flag.
            cls.local_update = 1;
            if (cls.nonlocal_update.load(std::memory_order_acquire)) {
                spin_mutex::scoped_lock lock(cls.mutex);
                ctx.my_node.remove_relaxed();
                cls.local_update.store(0, std::memory_order_relaxed);
            } else {
                ctx.my_node.remove_relaxed();
                // The release fence is necessary so that the update of our neighbors in
                // the context list is committed by the time a possible concurrent destroyer
                // proceeds after the local update flag is reset by the following store.
                cls.local_update.store(0, std::memory_order_release);
                if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) {
                    // Another thread was propagating a cancellation request when we removed
                    // ourselves from the list. We must ensure that it is not accessing us
                    // when this destructor finishes. We'll be able to acquire the lock
                    // below only after the other thread finishes with us.
                    spin_mutex::scoped_lock lock(cls.mutex);
                } else {
                    // TODO: simplify the exception propagation mechanism
                    std::atomic_thread_fence(std::memory_order_release);
                }
            }
        } else {
            d1::task_group_context::lifetime_state expected = d1::task_group_context::lifetime_state::bound;
            if (
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
                !((std::atomic<typename std::underlying_type<d1::task_group_context::lifetime_state>::type>&)ctx.my_lifetime_state).compare_exchange_strong(
                    (typename std::underlying_type<d1::task_group_context::lifetime_state>::type&)expected,
                    (typename std::underlying_type<d1::task_group_context::lifetime_state>::type)d1::task_group_context::lifetime_state::locked)
#else
                !ctx.my_lifetime_state.compare_exchange_strong(expected, d1::task_group_context::lifetime_state::locked)
#endif
                ) {
                __TBB_ASSERT(expected == d1::task_group_context::lifetime_state::detached, nullptr);
                // The "owner" local variable can be a dangling pointer here. Do not access it.
                owner = nullptr;
                spin_wait_until_eq(ctx.my_owner, nullptr);
                // It is unsafe to remove the node because its neighbors might be already destroyed.
                // TODO: reconsider the logic.
                // ctx.my_node.remove_relaxed();
            }
            else {
                __TBB_ASSERT(expected == d1::task_group_context::lifetime_state::bound, nullptr);
                __TBB_ASSERT(ctx.my_owner.load(std::memory_order_relaxed) != nullptr, nullptr);
                thread_data::context_list_state& cls = owner->my_context_list_state;
                __TBB_ASSERT(is_alive(cls.nonlocal_update.load(std::memory_order_relaxed)), "The owner should be alive.");

                ++cls.nonlocal_update;
                ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::dying, std::memory_order_release);
                spin_wait_until_eq(cls.local_update, 0u);
                {
                    spin_mutex::scoped_lock lock(cls.mutex);
                    ctx.my_node.remove_relaxed();
                }
                --cls.nonlocal_update;
            }
        }
    }

    if (ctx_lifetime_state == d1::task_group_context::lifetime_state::detached) {
        spin_wait_until_eq(ctx.my_owner, nullptr);
    }

    d1::cpu_ctl_env* ctl = reinterpret_cast<d1::cpu_ctl_env*>(&ctx.my_cpu_ctl_env);
#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER
    suppress_unused_warning(ctl);
#endif
    ctl->~cpu_ctl_env();

    if (ctx.my_exception)
        ctx.my_exception->destroy();
    ITT_STACK_DESTROY(ctx.my_itt_caller);

    poison_pointer(ctx.my_parent);
    poison_pointer(ctx.my_owner);
    poison_pointer(ctx.my_node.next);
    poison_pointer(ctx.my_node.prev);
    poison_pointer(ctx.my_exception);
    poison_pointer(ctx.my_itt_caller);
}

void task_group_context_impl::initialize(d1::task_group_context& ctx) {
    ITT_TASK_GROUP(&ctx, ctx.my_name, nullptr);

    ctx.my_cpu_ctl_env = 0;
    ctx.my_cancellation_requested = 0;
    ctx.my_state.store(0, std::memory_order_relaxed);
    // The context starts in the created state; it becomes bound (or isolated) at first use.
    ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::created, std::memory_order_relaxed);
    ctx.my_parent = nullptr;
    ctx.my_owner = nullptr;
    ctx.my_node.next.store(nullptr, std::memory_order_relaxed);
    ctx.my_node.prev.store(nullptr, std::memory_order_relaxed);
    ctx.my_exception = nullptr;
    ctx.my_itt_caller = nullptr;

    static_assert(sizeof(d1::cpu_ctl_env) <= sizeof(ctx.my_cpu_ctl_env), "FPU settings storage does not fit into uint64_t");
    d1::cpu_ctl_env* ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
    if (ctx.my_traits.fp_settings)
        ctl->get_env();
}

void task_group_context_impl::register_with(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), nullptr);
    __TBB_ASSERT(td, nullptr);
    ctx.my_owner.store(td, std::memory_order_relaxed);
    thread_data::context_list_state& cls = td->my_context_list_state;
    // The state propagation logic assumes that new contexts are bound to the head of the list.
    ctx.my_node.prev.store(&cls.head, std::memory_order_relaxed);
    // Notify threads that may be concurrently destroying contexts registered
    // in this scheduler's list that a local list update is underway.
    // Prevent the load of the global propagation epoch counter from being hoisted before
    // the speculative stores above, as well as the load of the nonlocal update flag from
    // being hoisted before the store to the local update flag.
    cls.local_update = 1;
    // Finalize the local context list update.
    if (cls.nonlocal_update.load(std::memory_order_acquire)) {
        spin_mutex::scoped_lock lock(cls.mutex);
        d1::context_list_node* head_next = cls.head.next.load(std::memory_order_relaxed);
        head_next->prev.store(&ctx.my_node, std::memory_order_relaxed);
        ctx.my_node.next.store(head_next, std::memory_order_relaxed);
        cls.local_update.store(0, std::memory_order_relaxed);
        cls.head.next.store(&ctx.my_node, std::memory_order_relaxed);
    } else {
        d1::context_list_node* head_next = cls.head.next.load(std::memory_order_relaxed);
        head_next->prev.store(&ctx.my_node, std::memory_order_relaxed);
        ctx.my_node.next.store(head_next, std::memory_order_relaxed);
        cls.local_update.store(0, std::memory_order_release);
        // The thread-local list of contexts allows concurrent traversal by another thread
        // while a state change is being propagated. To ensure visibility of ctx.my_node's
        // members to the concurrently traversing thread, the list's head is updated by
        // means of a store-with-release.
        cls.head.next.store(&ctx.my_node, std::memory_order_release);
    }
}
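/*
    Illustrative sketch (an assumption-laden distillation, not library code): the
    local_update / nonlocal_update flag handshake used above and in destroy(), shown in
    isolation. The names ListState, push_front_local and access_nonlocal are hypothetical
    stand-ins for thread_data::context_list_state and its users; only the ordering
    pattern mirrors the real code.

        #include <atomic>
        #include <mutex>

        struct ListState {
            std::atomic<unsigned> local_update{0};     // set by the owner while it edits its list
            std::atomic<unsigned> nonlocal_update{0};  // incremented by a thread touching a foreign list
            std::mutex mutex;                          // fallback when both sides are active
        };

        // Owner thread: publish a local list edit (cf. register_with above).
        inline void push_front_local(ListState& s) {
            s.local_update.store(1, std::memory_order_seq_cst);    // announce the local edit
            if (s.nonlocal_update.load(std::memory_order_acquire)) {
                std::lock_guard<std::mutex> lock(s.mutex);         // a foreign thread is active: serialize
                // ... link the node under the lock ...
                s.local_update.store(0, std::memory_order_relaxed);
            } else {
                // ... link the node without the lock ...
                s.local_update.store(0, std::memory_order_release);
            }
        }

        // Foreign thread: access another thread's list (cf. the non-owner path in destroy()).
        inline void access_nonlocal(ListState& s) {
            ++s.nonlocal_update;                                       // announce the nonlocal access
            while (s.local_update.load(std::memory_order_acquire)) {}  // wait out an in-flight local edit
            {
                std::lock_guard<std::mutex> lock(s.mutex);
                // ... traverse or unlink under the lock ...
            }
            --s.nonlocal_update;
        }
*/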

void task_group_context_impl::bind_to_impl(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), nullptr);
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) == d1::task_group_context::lifetime_state::locked, "The context can be bound only under the lock.");
    __TBB_ASSERT(!ctx.my_parent, "Parent is set before initial binding");

    ctx.my_parent = td->my_task_dispatcher->m_execute_data_ext.context;
    __TBB_ASSERT(ctx.my_parent, nullptr);

    // Inherit FPU settings only if the context has not captured FPU settings yet.
    if (!ctx.my_traits.fp_settings)
        copy_fp_settings(ctx, *ctx.my_parent);

    // The condition below prevents unnecessary thrashing of the parent context's cache line.
    if (ctx.my_parent->my_state.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) {
        ctx.my_parent->my_state.store(d1::task_group_context::may_have_children, std::memory_order_relaxed); // full fence is below
    }
    if (ctx.my_parent->my_parent) {
        // Even if this context were made accessible for state change propagation
        // (by the release store to td->my_context_list_state.head.next performed in
        // register_with), a state change could still be missed if propagation from a
        // grand-ancestor was underway concurrently with binding.
        // Speculative propagation from the parent, together with epoch counters that
        // detect the possibility of such a race, makes it possible to avoid taking locks
        // when there is no contention.

        // The acquire fence is necessary to prevent the subsequent speculative loads of
        // the parent state data from being reordered out of the scope where the epoch
        // counter comparison can reliably validate them.
        uintptr_t local_count_snapshot = ctx.my_parent->my_owner.load(std::memory_order_relaxed)->my_context_list_state.epoch.load(std::memory_order_acquire);
        // Speculative propagation of the parent's state. The speculation will be
        // validated by the epoch counter check further on.
        ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
        register_with(ctx, td); // Issues full fence

        // If no state propagation was detected by the following condition, the above
        // full fence guarantees that the parent had the correct state during speculative
        // propagation before the fence. Otherwise the propagation from the parent is
        // repeated under the lock.
        if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) {
            // Another thread may be propagating a state change right now. So resort to the lock.
            context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
            ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
        }
    } else {
        register_with(ctx, td); // Issues full fence
        // As we do not have grand-ancestors, concurrent state propagation (if any)
        // may originate only from the parent context, and thus it is safe to directly
        // copy the state from it.
        ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
    }

    ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::bound, std::memory_order_release);
}

void task_group_context_impl::bind_to(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), nullptr);
    d1::task_group_context::lifetime_state state = ctx.my_lifetime_state.load(std::memory_order_acquire);
    if (state <= d1::task_group_context::lifetime_state::locked) {
        if (state == d1::task_group_context::lifetime_state::created &&
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
            ((std::atomic<typename std::underlying_type<d1::task_group_context::lifetime_state>::type>&)ctx.my_lifetime_state).compare_exchange_strong(
                (typename std::underlying_type<d1::task_group_context::lifetime_state>::type&)state,
                (typename std::underlying_type<d1::task_group_context::lifetime_state>::type)d1::task_group_context::lifetime_state::locked)
#else
            ctx.my_lifetime_state.compare_exchange_strong(state, d1::task_group_context::lifetime_state::locked)
#endif
            ) {
            // If we are in the outermost task dispatch loop of an external thread, then
            // there is nothing to bind this context to, and we skip the binding part,
            // treating the context as isolated.
            __TBB_ASSERT(td->my_task_dispatcher->m_execute_data_ext.context != nullptr, nullptr);
            if (td->my_task_dispatcher->m_execute_data_ext.context == td->my_arena->my_default_ctx || !ctx.my_traits.bound) {
                if (!ctx.my_traits.fp_settings) {
                    copy_fp_settings(ctx, *td->my_arena->my_default_ctx);
                }
                ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::isolated, std::memory_order_release);
            } else {
                bind_to_impl(ctx, td);
            }
            ITT_STACK_CREATE(ctx.my_itt_caller);
        }
        spin_wait_while_eq(ctx.my_lifetime_state, d1::task_group_context::lifetime_state::locked);
    }
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) != d1::task_group_context::lifetime_state::created, nullptr);
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) != d1::task_group_context::lifetime_state::locked, nullptr);
}
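/*
    Added summary (descriptive commentary only, inferred from bind_to, bind_to_impl, and
    destroy in this file; see the lifetime_state definition for the authoritative set of
    values and transitions):

        created  --CAS in bind_to-->  locked  --bind_to_impl-->  bound
        created  --CAS in bind_to-->  locked  --nothing to bind to / unbound trait-->  isolated
        bound    --CAS in destroy (called off the owner thread)-->  locked  -->  dying
        detached: destroy waits for my_owner to become nullptr before tearing the context down
*/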

template <typename T>
void task_group_context_impl::propagate_task_group_state(d1::task_group_context& ctx, std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), nullptr);
    if ((ctx.*mptr_state).load(std::memory_order_relaxed) == new_state) {
        // Nothing to do, whether descending from "src" or not, so no need to scan.
        // Hopefully this happens often thanks to earlier invocations.
        // This optimization is enabled by LIFO order in the context lists:
        // - new contexts are bound to the beginning of lists;
        // - descendants are newer than ancestors;
        // - earlier invocations are therefore likely to "paint" long chains.
    } else if (&ctx == &src) {
        // This clause is disjunct from the traversal below, which skips src entirely.
        // Note that src.*mptr_state is not necessarily still equal to new_state (another thread may have changed it again).
        // Such interference is probably not frequent enough to aim for optimization by writing new_state again (to make the other thread back down).
        // Letting the other thread prevail may also be fairer.
    } else {
        for (d1::task_group_context* ancestor = ctx.my_parent; ancestor != nullptr; ancestor = ancestor->my_parent) {
            if (ancestor == &src) {
                for (d1::task_group_context* c = &ctx; c != ancestor; c = c->my_parent)
                    (c->*mptr_state).store(new_state, std::memory_order_relaxed);
                break;
            }
        }
    }
}
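/*
    Illustrative sketch (hypothetical standalone code, not library internals): the
    ancestor-walk "painting" above, reduced to a plain parent-linked tree. The names
    Node and paint_if_descendant are invented for illustration; only the traversal
    shape matches the member function above.

        #include <atomic>

        struct Node {
            Node* parent = nullptr;
            std::atomic<int> state{0};
        };

        // If `src` is an ancestor of `n`, copy new_state into every node on the
        // path from `n` up to (but not including) `src`.
        inline void paint_if_descendant(Node& n, Node& src, int new_state) {
            for (Node* ancestor = n.parent; ancestor != nullptr; ancestor = ancestor->parent) {
                if (ancestor == &src) {
                    for (Node* c = &n; c != ancestor; c = c->parent)
                        c->state.store(new_state, std::memory_order_relaxed);
                    break;
                }
            }
        }
*/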

template <typename T>
void thread_data::propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    spin_mutex::scoped_lock lock(my_context_list_state.mutex);
    // The acquire fence is necessary to ensure that the subsequent node->next load
    // returns the correct value in case the node was just inserted by another thread.
    // The fence also ensures visibility of the correct ctx.my_parent value.
    d1::context_list_node* node = my_context_list_state.head.next.load(std::memory_order_acquire);
    while (node != &my_context_list_state.head) {
        d1::task_group_context& ctx = __TBB_get_object_ref(d1::task_group_context, my_node, node);
        if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state)
            task_group_context_impl::propagate_task_group_state(ctx, mptr_state, src, new_state);
        node = node->next.load(std::memory_order_relaxed);
    }
    // Sync up the local propagation epoch with the global one. The release fence prevents
    // reordering of a possible store to *mptr_state after the sync point.
    my_context_list_state.epoch.store(the_context_state_propagation_epoch.load(std::memory_order_relaxed), std::memory_order_release);
}

template <typename T>
bool market::propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    if (src.my_state.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children)
        return true;
    // The whole propagation algorithm is under the lock in order to ensure correctness
    // in case of concurrent state changes at different levels of the context tree.
    // See comment 2 at the bottom of this file.
    context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
    if ((src.*mptr_state).load(std::memory_order_relaxed) != new_state)
        // Another thread has concurrently changed the state. Back down.
        return false;
    // Advance the global state propagation epoch.
    ++the_context_state_propagation_epoch;
    // Propagate to all workers and external threads and sync up their local epochs with the global one.
    unsigned num_workers = my_first_unused_worker_idx;
    for (unsigned i = 0; i < num_workers; ++i) {
        thread_data* td = my_workers[i];
        // If the worker is only about to be registered, skip it.
        if (td)
            td->propagate_task_group_state(mptr_state, src, new_state);
    }
    // Propagate to all external threads.
    // The whole propagation sequence is locked, thus no contention is expected.
    for (thread_data_list_type::iterator it = my_masters.begin(); it != my_masters.end(); ++it)
        it->propagate_task_group_state(mptr_state, src, new_state);
    return true;
}

bool task_group_context_impl::cancel_group_execution(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), nullptr);
    __TBB_ASSERT(ctx.my_cancellation_requested.load(std::memory_order_relaxed) <= 1, "The cancellation state can be either 0 or 1");
    if (ctx.my_cancellation_requested.load(std::memory_order_relaxed) || ctx.my_cancellation_requested.exchange(1)) {
        // This task group and any descendants have already been canceled.
        // (A newly added descendant would inherit its parent's ctx.my_cancellation_requested,
        // not missing out on any cancellation still being propagated, and a context cannot be uncanceled.)
        return false;
    }
    governor::get_thread_data()->my_arena->my_market->propagate_task_group_state(&d1::task_group_context::my_cancellation_requested, ctx, uint32_t(1));
    return true;
}
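/*
    Illustrative sketch (added commentary, assuming the public oneTBB API): the entry
    points above back the user-visible cancellation interface of tbb::task_group_context.
    A typical use from application code might look like the following; the loop bounds
    and the cancellation trigger are arbitrary examples.

        #include <oneapi/tbb/parallel_for.h>
        #include <oneapi/tbb/task_group.h>   // tbb::task_group_context

        void example() {
            tbb::task_group_context ctx;
            tbb::parallel_for(0, 1000000, [&ctx](int i) {
                if (i == 12345)
                    ctx.cancel_group_execution();       // ends up in cancel_group_execution() above
                if (ctx.is_group_execution_cancelled())
                    return;                             // cooperative early exit
                // ... process element i ...
            }, ctx);
        }
*/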

bool task_group_context_impl::is_group_execution_cancelled(const d1::task_group_context& ctx) {
    return ctx.my_cancellation_requested.load(std::memory_order_relaxed) != 0;
}

// IMPORTANT: It is assumed that this method is not used concurrently!
void task_group_context_impl::reset(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), nullptr);
    //! TODO: Add assertion that this context does not have children
    // No fences are necessary since this context can be accessed from another thread
    // only after stealing happened (which means necessary fences were used).
    if (ctx.my_exception) {
        ctx.my_exception->destroy();
        ctx.my_exception = nullptr;
    }
    ctx.my_cancellation_requested = 0;
}

// IMPORTANT: It is assumed that this method is not used concurrently!
void task_group_context_impl::capture_fp_settings(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), nullptr);
    //! TODO: Add assertion that this context does not have children
    // No fences are necessary since this context can be accessed from another thread
    // only after stealing happened (which means necessary fences were used).
    d1::cpu_ctl_env* ctl = reinterpret_cast<d1::cpu_ctl_env*>(&ctx.my_cpu_ctl_env);
    if (!ctx.my_traits.fp_settings) {
        ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
        ctx.my_traits.fp_settings = true;
    }
    ctl->get_env();
}

void task_group_context_impl::copy_fp_settings(d1::task_group_context& ctx, const d1::task_group_context& src) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), nullptr);
    __TBB_ASSERT(!ctx.my_traits.fp_settings, "The context already has FPU settings.");
    __TBB_ASSERT(src.my_traits.fp_settings, "The source context does not have FPU settings.");

    const d1::cpu_ctl_env* src_ctl = reinterpret_cast<const d1::cpu_ctl_env*>(&src.my_cpu_ctl_env);
    new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env(*src_ctl);
    ctx.my_traits.fp_settings = true;
}

/*
    Comments:

1.  The premise of the cancellation support implementation is that cancellations are
    not part of the hot path of program execution. Therefore any changes made to this
    implementation to reduce the overhead of the cancellation control flow must not
    increase the overhead of normal execution.

    In general, contexts are used by all threads, and their descendants are created in
    different threads as well. To minimize the impact of cross-thread tree maintenance
    (first of all, the synchronization it requires), the tree of contexts is split into
    pieces, each of which is handled by a single thread. Such pieces are represented as
    lists of contexts whose members are the contexts that were bound to their parents
    in the given thread.

    The context tree maintenance and cancellation propagation algorithms are designed
    in such a manner that cross-thread access to a context list takes place only
    when a cancellation signal is sent (by the user or when an exception happens), and
    synchronization is necessary only then. Thus the normal execution flow (without
    exceptions and cancellation) remains free from any synchronization done on
    behalf of exception handling and cancellation support.

2.  Consider parallel cancellations at different levels of the context tree:

        Ctx1 <- Cancelled by Thread1            |- Thread2 started processing
         |                                      |
        Ctx2                                    |- Thread1 started processing
         |                                   T1 |- Thread2 finishes and syncs up local counters
        Ctx3 <- Cancelled by Thread2            |
         |                                      |- Ctx5 is bound to Ctx2
        Ctx4                                    |
                                             T2 |- Thread1 reaches Ctx2

    The propagating thread of each cancellation increments the global counter. However,
    the thread propagating the cancellation from the outermost context (Thread1) may be
    the last to finish, which means that the local counters may be synchronized earlier
    (by Thread2, at time T1) than the cancellation is propagated into Ctx2 (at time T2).
    If a new context (Ctx5) is created and bound to Ctx2 between T1 and T2, checking only
    its parent (Ctx2) may result in the cancellation request being lost.

    This issue is solved by doing the whole propagation under the lock.

    If we need more concurrency while processing parallel cancellations, we could try
    the following modification of the propagation algorithm:

    advance the global counter and remember it
    for each thread:
        scan the thread's list of contexts
    for each thread:
        sync up its local counter only if the global counter has not changed

    However, this version of the algorithm requires more analysis and verification.
*/
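/*
    Illustrative sketch (an assumption-laden interpretation of the alternative algorithm
    outlined above, not part of the library): the thread list, the counter names, and
    scan_context_list are hypothetical; only the two-pass structure and the "sync only if
    the global counter has not changed" check follow the pseudocode.

        #include <atomic>
        #include <cstdint>
        #include <vector>

        struct ThreadState {
            std::atomic<std::uintptr_t> local_epoch{0};
            // ... the per-thread context list would live here ...
        };

        inline void scan_context_list(ThreadState&) {} // propagate the new state to this thread's contexts

        inline void propagate_concurrently(std::vector<ThreadState*>& threads,
                                           std::atomic<std::uintptr_t>& global_epoch) {
            const std::uintptr_t snapshot = ++global_epoch;      // advance the global counter and remember it
            for (ThreadState* t : threads)                       // first pass: scan all context lists
                scan_context_list(*t);
            for (ThreadState* t : threads)                       // second pass: sync local epochs
                if (global_epoch.load(std::memory_order_relaxed) == snapshot)
                    t->local_epoch.store(snapshot, std::memory_order_release);
        }
*/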

void __TBB_EXPORTED_FUNC initialize(d1::task_group_context& ctx) {
    task_group_context_impl::initialize(ctx);
}
void __TBB_EXPORTED_FUNC destroy(d1::task_group_context& ctx) {
    task_group_context_impl::destroy(ctx);
}
void __TBB_EXPORTED_FUNC reset(d1::task_group_context& ctx) {
    task_group_context_impl::reset(ctx);
}
bool __TBB_EXPORTED_FUNC cancel_group_execution(d1::task_group_context& ctx) {
    return task_group_context_impl::cancel_group_execution(ctx);
}
bool __TBB_EXPORTED_FUNC is_group_execution_cancelled(d1::task_group_context& ctx) {
    return task_group_context_impl::is_group_execution_cancelled(ctx);
}
void __TBB_EXPORTED_FUNC capture_fp_settings(d1::task_group_context& ctx) {
    task_group_context_impl::capture_fp_settings(ctx);
}

} // namespace r1
} // namespace detail
} // namespace tbb