/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include "oneapi/tbb/detail/_config.h"
#include "oneapi/tbb/tbb_allocator.h"
#include "oneapi/tbb/task_group.h"
#include "governor.h"
#include "thread_data.h"
#include "scheduler_common.h"
#include "itt_notify.h"
#include "task_dispatcher.h"

#include <type_traits>

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// tbb_exception_ptr
//------------------------------------------------------------------------
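// tbb_exception_ptr wraps a std::exception_ptr captured at the throw site so that the
// exception can be transferred to, and rethrown on, another thread. Instances are
// allocated from the library's internal allocator (allocate_memory/deallocate_memory),
// hence the explicit allocate()/destroy() pair instead of new/delete.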
tbb_exception_ptr* tbb_exception_ptr::allocate() noexcept {
    tbb_exception_ptr* eptr = (tbb_exception_ptr*)allocate_memory(sizeof(tbb_exception_ptr));
    return eptr ? new (eptr) tbb_exception_ptr(std::current_exception()) : nullptr;
}

void tbb_exception_ptr::destroy() noexcept {
    this->~tbb_exception_ptr();
    deallocate_memory(this);
}

void tbb_exception_ptr::throw_self() {
    if (governor::rethrow_exception_broken()) fix_broken_rethrow();
    std::rethrow_exception(my_ptr);
}

//------------------------------------------------------------------------
// task_group_context
//------------------------------------------------------------------------

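// Detaches the context from the owner thread's context list and releases its resources.
// In brief: a bound context destroyed by its owner unlinks itself directly, coordinating
// with concurrent propagators via the local_update/nonlocal_update flags; a bound context
// destroyed by a different thread first locks my_lifetime_state and then unlinks itself
// under the owner's list mutex while nonlocal_update is raised; a detached context only
// waits until the former owner clears my_owner.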
void task_group_context_impl::destroy(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);

    auto ctx_lifetime_state = ctx.my_lifetime_state.load(std::memory_order_relaxed);
    __TBB_ASSERT(ctx_lifetime_state != d1::task_group_context::lifetime_state::locked, nullptr);

    if (ctx_lifetime_state == d1::task_group_context::lifetime_state::bound) {
        // The owner can be destroyed at any moment. Access the associated data with caution.
        thread_data* owner = ctx.my_owner.load(std::memory_order_relaxed);
        if (governor::is_thread_data_set(owner)) {
            thread_data::context_list_state& cls = owner->my_context_list_state;
            // We are the owner, so cls is valid.
            // Local update of the context list
            std::uintptr_t local_count_snapshot = cls.epoch.load(std::memory_order_relaxed);
            // The sequentially consistent store prevents the load of the nonlocal update flag
            // from being hoisted above the store to the local update flag.
            cls.local_update = 1;
            if (cls.nonlocal_update.load(std::memory_order_relaxed)) {
                spin_mutex::scoped_lock lock(cls.mutex);
                ctx.my_node.remove_relaxed();
                cls.local_update.store(0, std::memory_order_relaxed);
            } else {
                ctx.my_node.remove_relaxed();
                // The release fence ensures that the update of our neighbors in the context
                // list is committed by the time a possible concurrent destroyer proceeds
                // after the local update flag is reset by the following store.
                cls.local_update.store(0, std::memory_order_release);
                if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) {
                    // Another thread was propagating a cancellation request when we removed
                    // ourselves from the list. We must ensure that it is not accessing us
                    // when this destructor finishes. We'll be able to acquire the lock
                    // below only after the other thread finishes with us.
                    spin_mutex::scoped_lock lock(cls.mutex);
                }
            }
        } else {
        } else {
            d1::task_group_context::lifetime_state expected = d1::task_group_context::lifetime_state::bound;
            if (
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
                !((std::atomic<typename std::underlying_type<d1::task_group_context::lifetime_state>::type>&)ctx.my_lifetime_state).compare_exchange_strong(
                    (typename std::underlying_type<d1::task_group_context::lifetime_state>::type&)expected,
                    (typename std::underlying_type<d1::task_group_context::lifetime_state>::type)d1::task_group_context::lifetime_state::locked)
#else
                !ctx.my_lifetime_state.compare_exchange_strong(expected, d1::task_group_context::lifetime_state::locked)
#endif
                ) {
                __TBB_ASSERT(expected == d1::task_group_context::lifetime_state::detached, nullptr);
                // The "owner" local variable can be a dangling pointer here. Do not access it.
                owner = nullptr;
                spin_wait_until_eq(ctx.my_owner, nullptr);
                // It is unsafe to remove the node because its neighbors might be already destroyed.
                // TODO: reconsider the logic.
                // ctx.my_node.remove_relaxed();
            }
            else {
                __TBB_ASSERT(expected == d1::task_group_context::lifetime_state::bound, nullptr);
                __TBB_ASSERT(ctx.my_owner.load(std::memory_order_relaxed) != nullptr, nullptr);
                thread_data::context_list_state& cls = owner->my_context_list_state;
                __TBB_ASSERT(is_alive(cls.nonlocal_update.load(std::memory_order_relaxed)), "The owner should be alive.");

                ++cls.nonlocal_update;
                ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::dying, std::memory_order_release);
                spin_wait_until_eq(cls.local_update, 0u);
                {
                    spin_mutex::scoped_lock lock(cls.mutex);
                    ctx.my_node.remove_relaxed();
                }
                --cls.nonlocal_update;
            }
        }
    }

    if (ctx_lifetime_state == d1::task_group_context::lifetime_state::detached) {
        spin_wait_until_eq(ctx.my_owner, nullptr);
    }

    d1::cpu_ctl_env* ctl = reinterpret_cast<d1::cpu_ctl_env*>(&ctx.my_cpu_ctl_env);
#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER
    suppress_unused_warning(ctl);
#endif
    ctl->~cpu_ctl_env();

    if (ctx.my_exception)
        ctx.my_exception->destroy();
    ITT_STACK_DESTROY(ctx.my_itt_caller);

    poison_pointer(ctx.my_parent);
    poison_pointer(ctx.my_owner);
    poison_pointer(ctx.my_node.next);
    poison_pointer(ctx.my_node.prev);
    poison_pointer(ctx.my_exception);
    poison_pointer(ctx.my_itt_caller);
}

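// Puts the context into its initial state. Note that binding to a parent (or marking the
// context isolated) is deferred until the first use; see bind_to() below.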
void task_group_context_impl::initialize(d1::task_group_context& ctx) {
    ITT_TASK_GROUP(&ctx, ctx.my_name, nullptr);

    ctx.my_cpu_ctl_env = 0;
    ctx.my_cancellation_requested = 0;
    ctx.my_state.store(0, std::memory_order_relaxed);
    // The context starts in the 'created' state; it is switched to 'bound' (or 'isolated') at the first usage.
    ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::created, std::memory_order_relaxed);
    ctx.my_parent = nullptr;
    ctx.my_owner = nullptr;
    ctx.my_node.next.store(nullptr, std::memory_order_relaxed);
    ctx.my_node.prev.store(nullptr, std::memory_order_relaxed);
    ctx.my_exception = nullptr;
    ctx.my_itt_caller = nullptr;

    static_assert(sizeof(d1::cpu_ctl_env) <= sizeof(ctx.my_cpu_ctl_env), "FPU settings storage does not fit to uint64_t");
    d1::cpu_ctl_env* ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
    if (ctx.my_traits.fp_settings)
        ctl->get_env();
}

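// Adds the context to the head of td's thread-local context list and records td as the owner.
// The local_update/nonlocal_update flags implement a lightweight handshake with threads that
// may be concurrently destroying contexts registered in the same list: if a nonlocal update
// is in flight, the insertion falls back to the list mutex.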
void task_group_context_impl::register_with(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(td, NULL);
    ctx.my_owner.store(td, std::memory_order_relaxed);
    thread_data::context_list_state& cls = td->my_context_list_state;
    // state propagation logic assumes new contexts are bound to head of the list
    ctx.my_node.prev.store(&cls.head, std::memory_order_relaxed);
    // Notify threads that may be concurrently destroying contexts registered
    // in this scheduler's list that local list update is underway.
    // Prevent load of global propagation epoch counter from being hoisted before
    // speculative stores above, as well as load of nonlocal update flag from
    // being hoisted before the store to local update flag.
    cls.local_update = 1;
    // Finalize local context list update
    if (cls.nonlocal_update.load(std::memory_order_relaxed)) {
        spin_mutex::scoped_lock lock(cls.mutex);
        d1::context_list_node* head_next = cls.head.next.load(std::memory_order_relaxed);
        head_next->prev.store(&ctx.my_node, std::memory_order_relaxed);
        ctx.my_node.next.store(head_next, std::memory_order_relaxed);
        cls.local_update.store(0, std::memory_order_relaxed);
        cls.head.next.store(&ctx.my_node, std::memory_order_relaxed);
    } else {
        d1::context_list_node* head_next = cls.head.next.load(std::memory_order_relaxed);
        head_next->prev.store(&ctx.my_node, std::memory_order_relaxed);
        ctx.my_node.next.store(head_next, std::memory_order_relaxed);
        cls.local_update.store(0, std::memory_order_release);
        // Thread-local list of contexts allows concurrent traversal by another thread
        // while propagating state change. To ensure visibility of ctx.my_node's members
        // to the concurrently traversing thread, the list's head is updated by means
        // of store-with-release.
        cls.head.next.store(&ctx.my_node, std::memory_order_release);
    }
}

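// Binds the context to its parent (the context of the innermost task being executed by td),
// inherits FP settings if needed, and speculatively copies the parent's cancellation state.
// The epoch counter snapshot taken before the speculative copy is used afterwards to detect
// a concurrent propagation; if one is detected, the copy is repeated under the global lock.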
void task_group_context_impl::bind_to_impl(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) == d1::task_group_context::lifetime_state::locked, "The context can be bound only under the lock.");
    __TBB_ASSERT(!ctx.my_parent, "Parent is set before initial binding");

    ctx.my_parent = td->my_task_dispatcher->m_execute_data_ext.context;
    __TBB_ASSERT(ctx.my_parent, NULL);

    // Inherit FPU settings only if the context has not captured FPU settings yet.
    if (!ctx.my_traits.fp_settings)
        copy_fp_settings(ctx, *ctx.my_parent);

    // The condition below prevents unnecessary thrashing of the parent context's cache line
    if (ctx.my_parent->my_state.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) {
        ctx.my_parent->my_state.store(d1::task_group_context::may_have_children, std::memory_order_relaxed); // full fence is below
    }
    if (ctx.my_parent->my_parent) {
        // Even if this context were made accessible for state change propagation
        // (by placing store_with_release(td->my_context_list_state.head.next, &ctx.my_node)
        // above), it still could be missed if state propagation from a grand-ancestor
        // was underway concurrently with binding.
        // Speculative propagation from the parent, together with epoch counters that
        // detect the possibility of such a race, makes it possible to avoid taking locks
        // when there is no contention.

        // The acquire fence is necessary to prevent subsequent speculative loads of the
        // parent's state data from being reordered out of the scope where the epoch
        // counter comparison can reliably validate them.
        uintptr_t local_count_snapshot = ctx.my_parent->my_owner.load(std::memory_order_relaxed)->my_context_list_state.epoch.load(std::memory_order_acquire);
        // Speculative propagation of parent's state. The speculation will be
        // validated by the epoch counters check further on.
        ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
        register_with(ctx, td); // Issues full fence

        // If no state propagation was detected by the following check, the above full
        // fence guarantees that the parent had the correct state during the speculative
        // propagation before the fence. Otherwise the propagation from the parent is
        // repeated under the lock.
        if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) {
            // Another thread may be propagating a state change right now, so resort to the lock.
            context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
            ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
        }
    } else {
        register_with(ctx, td); // Issues full fence
        // As we do not have grand-ancestors, concurrent state propagation (if any)
        // may originate only from the parent context, and thus it is safe to directly
        // copy the state from it.
        ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
    }

    ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::bound, std::memory_order_release);
}

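// Lazily performs the first binding of the context. The thread that wins the CAS from
// 'created' to 'locked' either marks the context isolated (outermost dispatch loop of an
// external thread, or an unbound context) or binds it via bind_to_impl(); other threads
// spin until the state leaves 'locked'.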
void task_group_context_impl::bind_to(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    d1::task_group_context::lifetime_state state = ctx.my_lifetime_state.load(std::memory_order_acquire);
    if (state <= d1::task_group_context::lifetime_state::locked) {
        if (state == d1::task_group_context::lifetime_state::created &&
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
            ((std::atomic<typename std::underlying_type<d1::task_group_context::lifetime_state>::type>&)ctx.my_lifetime_state).compare_exchange_strong(
            (typename std::underlying_type<d1::task_group_context::lifetime_state>::type&)state,
                (typename std::underlying_type<d1::task_group_context::lifetime_state>::type)d1::task_group_context::lifetime_state::locked)
#else
            ctx.my_lifetime_state.compare_exchange_strong(state, d1::task_group_context::lifetime_state::locked)
#endif
            ) {
            // If we are in the outermost task dispatch loop of an external thread, then
            // there is nothing to bind this context to, and we skip the binding part
            // treating the context as isolated.
            __TBB_ASSERT(td->my_task_dispatcher->m_execute_data_ext.context != nullptr, nullptr);
            if (td->my_task_dispatcher->m_execute_data_ext.context == td->my_arena->my_default_ctx || !ctx.my_traits.bound) {
                if (!ctx.my_traits.fp_settings) {
                    copy_fp_settings(ctx, *td->my_arena->my_default_ctx);
                }
                ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::isolated, std::memory_order_release);
            } else {
                bind_to_impl(ctx, td);
            }
            ITT_STACK_CREATE(ctx.my_itt_caller);
        }
        spin_wait_while_eq(ctx.my_lifetime_state, d1::task_group_context::lifetime_state::locked);
    }
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) != d1::task_group_context::lifetime_state::created, NULL);
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) != d1::task_group_context::lifetime_state::locked, NULL);
}

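// Propagates new_state to ctx only if ctx is a descendant of src: the ancestor chain is
// walked upwards, and when src is found the path from ctx up to (but excluding) src is
// repainted. Contexts already carrying new_state are skipped, which keeps repeated
// invocations cheap thanks to the LIFO order of the context lists.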
template <typename T>
void task_group_context_impl::propagate_task_group_state(d1::task_group_context& ctx, std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    if ((ctx.*mptr_state).load(std::memory_order_relaxed) == new_state) {
        // Nothing to do, whether descending from "src" or not, so no need to scan.
        // Hopefully this happens often thanks to earlier invocations.
        // This optimization is enabled by LIFO order in the context lists:
        // - new contexts are bound to the beginning of lists;
        // - descendants are newer than ancestors;
        // - earlier invocations are therefore likely to "paint" long chains.
    } else if (&ctx == &src) {
        // This clause is disjunct from the traversal below, which skips src entirely.
        // Note that src.*mptr_state is not necessarily still equal to new_state (another thread may have changed it again).
        // Such interference is probably not frequent enough to aim for optimisation by writing new_state again (to make the other thread back down).
        // Letting the other thread prevail may also be fairer.
    } else {
        for (d1::task_group_context* ancestor = ctx.my_parent; ancestor != NULL; ancestor = ancestor->my_parent) {
            if (ancestor == &src) {
                for (d1::task_group_context* c = &ctx; c != ancestor; c = c->my_parent)
                    (c->*mptr_state).store(new_state, std::memory_order_relaxed);
                break;
            }
        }
    }
}

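// Test-and-set of the cancellation flag: a relaxed load filters out contexts that are already
// cancelled, the exchange arbitrates between concurrent cancellers, and only the winner
// propagates the request through the market to the whole subtree.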
bool task_group_context_impl::cancel_group_execution(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(ctx.my_cancellation_requested.load(std::memory_order_relaxed) <= 1, "The cancellation state can be either 0 or 1");
    if (ctx.my_cancellation_requested.load(std::memory_order_relaxed) || ctx.my_cancellation_requested.exchange(1)) {
        // This task group and any descendants have already been canceled.
        // (A newly added descendant would inherit its parent's ctx.my_cancellation_requested,
        // not missing out on any cancellation still being propagated, and a context cannot be uncanceled.)
        return false;
    }
    governor::get_thread_data()->my_arena->my_market->propagate_task_group_state(&d1::task_group_context::my_cancellation_requested, ctx, uint32_t(1));
    return true;
}

bool task_group_context_impl::is_group_execution_cancelled(const d1::task_group_context& ctx) {
    return ctx.my_cancellation_requested.load(std::memory_order_relaxed) != 0;
}

// IMPORTANT: It is assumed that this method is not used concurrently!
void task_group_context_impl::reset(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    //! TODO: Add assertion that this context does not have children
    // No fences are necessary since this context can be accessed from another thread
    // only after stealing happened (which means necessary fences were used).
    if (ctx.my_exception) {
        ctx.my_exception->destroy();
        ctx.my_exception = NULL;
    }
    ctx.my_cancellation_requested = 0;
}

// IMPORTANT: It is assumed that this method is not used concurrently!
void task_group_context_impl::capture_fp_settings(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    //! TODO: Add assertion that this context does not have children
    // No fences are necessary since this context can be accessed from another thread
    // only after stealing happened (which means necessary fences were used).
    d1::cpu_ctl_env* ctl = reinterpret_cast<d1::cpu_ctl_env*>(&ctx.my_cpu_ctl_env);
    if (!ctx.my_traits.fp_settings) {
        ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
        ctx.my_traits.fp_settings = true;
    }
    ctl->get_env();
}

void task_group_context_impl::copy_fp_settings(d1::task_group_context& ctx, const d1::task_group_context& src) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(!ctx.my_traits.fp_settings, "The context already has FPU settings.");
    __TBB_ASSERT(src.my_traits.fp_settings, "The source context does not have FPU settings.");

    const d1::cpu_ctl_env* src_ctl = reinterpret_cast<const d1::cpu_ctl_env*>(&src.my_cpu_ctl_env);
    new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env(*src_ctl);
    ctx.my_traits.fp_settings = true;
}

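// Walks this thread's context list under its mutex and applies the state change to every
// registered context that descends from src, then records the global propagation epoch
// so that later bindings can tell whether they may have missed this propagation.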
template <typename T>
void thread_data::propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    spin_mutex::scoped_lock lock(my_context_list_state.mutex);
    // The acquire fence is necessary to ensure that the subsequent node->next load
    // returns the correct value in case the node was just inserted by another thread.
    // The fence also ensures visibility of the correct ctx.my_parent value.
    d1::context_list_node* node = my_context_list_state.head.next.load(std::memory_order_acquire);
    while (node != &my_context_list_state.head) {
        d1::task_group_context& ctx = __TBB_get_object_ref(d1::task_group_context, my_node, node);
        if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state)
            task_group_context_impl::propagate_task_group_state(ctx, mptr_state, src, new_state);
        node = node->next.load(std::memory_order_relaxed);
    }
    // Sync up the local propagation epoch with the global one. The release fence prevents
    // a possible store to *mptr_state from being reordered after the sync point.
    my_context_list_state.epoch.store(the_context_state_propagation_epoch.load(std::memory_order_relaxed), std::memory_order_release);
}

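// Entry point of a propagation wave: serialized by the global propagation mutex, it bumps
// the global epoch and then visits the context lists of all workers and external threads.
// Returns false if another thread has concurrently changed src's state again, true otherwise.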
template <typename T>
bool market::propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    if (src.my_state.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children)
        return true;
    // The whole propagation algorithm is under the lock in order to ensure correctness
    // in case of concurrent state changes at the different levels of the context tree.
    // See the comment block at the bottom of this file.
    context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
    if ((src.*mptr_state).load(std::memory_order_relaxed) != new_state)
        // Another thread has concurrently changed the state. Back down.
        return false;
    // Advance global state propagation epoch
    ++the_context_state_propagation_epoch;
    // Propagate to all workers and external threads and sync up their local epochs with the global one
    unsigned num_workers = my_first_unused_worker_idx;
    for (unsigned i = 0; i < num_workers; ++i) {
        thread_data* td = my_workers[i];
        // If the worker is only about to be registered, skip it.
        if (td)
            td->propagate_task_group_state(mptr_state, src, new_state);
    }
    // Propagate to all external threads
    // The whole propagation sequence is locked, thus no contention is expected
    for (thread_data_list_type::iterator it = my_masters.begin(); it != my_masters.end(); it++)
        it->propagate_task_group_state(mptr_state, src, new_state);
    return true;
}

/*
    Comments:

1.  The premise of the cancellation support implementation is that cancellations are
    not part of the hot path of the program execution. Therefore any change made to
    reduce the overhead of the cancellation control flow must not increase the
    overhead of normal execution.

    In general, contexts are used by all threads and their descendants are created in
    different threads as well. To minimize the impact of cross-thread tree maintenance
    (primarily the synchronization it requires), the tree of contexts is split into
    pieces, each of which is handled by a single thread. Each piece is represented as
    a list whose members are the contexts that were bound to their parents in the
    given thread.

    The context tree maintenance and cancellation propagation algorithms are designed
    in such a manner that cross-thread access to a context list takes place only
    when a cancellation signal is sent (by the user or when an exception happens), and
    synchronization is necessary only then. Thus the normal execution flow (without
    exceptions and cancellation) remains free from any synchronization done on
    behalf of exception handling and cancellation support.

2.  Consider parallel cancellations at the different levels of the context tree:

        Ctx1 <- Cancelled by Thread1            |- Thread2 started processing
         |                                      |
        Ctx2                                    |- Thread1 started processing
         |                                   T1 |- Thread2 finishes and syncs up local counters
        Ctx3 <- Cancelled by Thread2            |
         |                                      |- Ctx5 is bound to Ctx2
        Ctx4                                    |
                                             T2 |- Thread1 reaches Ctx2

    The thread propagating each cancellation increments the global counter. However,
    the thread propagating the cancellation from the outermost context (Thread1) may
    be the last to finish, which means that the local counters may be synchronized
    (by Thread2, at T1) before Thread1 has propagated the cancellation into Ctx2 (at T2).
    If a new context (Ctx5) is created and bound to Ctx2 between T1 and T2, checking
    only its parent (Ctx2) may result in the cancellation request being lost.

    This issue is solved by doing the whole propagation under the lock.

    If we need more concurrency while processing parallel cancellations, we could try
    the following modification of the propagation algorithm:

    advance global counter and remember it
    for each thread:
        scan thread's list of contexts
    for each thread:
        sync up its local counter only if the global counter has not been changed

    However, this version of the algorithm requires more analysis and verification.
*/

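// Entry points exported to the d1 (header-level) layer; each one forwards to the
// corresponding task_group_context_impl member above.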
void __TBB_EXPORTED_FUNC initialize(d1::task_group_context& ctx) {
    task_group_context_impl::initialize(ctx);
}
void __TBB_EXPORTED_FUNC destroy(d1::task_group_context& ctx) {
    task_group_context_impl::destroy(ctx);
}
void __TBB_EXPORTED_FUNC reset(d1::task_group_context& ctx) {
    task_group_context_impl::reset(ctx);
}
bool __TBB_EXPORTED_FUNC cancel_group_execution(d1::task_group_context& ctx) {
    return task_group_context_impl::cancel_group_execution(ctx);
}
bool __TBB_EXPORTED_FUNC is_group_execution_cancelled(d1::task_group_context& ctx) {
    return task_group_context_impl::is_group_execution_cancelled(ctx);
}
void __TBB_EXPORTED_FUNC capture_fp_settings(d1::task_group_context& ctx) {
    task_group_context_impl::capture_fp_settings(ctx);
}

} // namespace r1
} // namespace detail
} // namespace tbb