/*
    Copyright (c) 2005-2020 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include "oneapi/tbb/detail/_config.h"
#include "oneapi/tbb/tbb_allocator.h"
#include "oneapi/tbb/task_group.h"
#include "governor.h"
#include "thread_data.h"
#include "scheduler_common.h"
#include "itt_notify.h"
#include "task_dispatcher.h"

#include <type_traits>

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// tbb_exception_ptr
//------------------------------------------------------------------------
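// Captures the exception currently being handled (std::current_exception()) into
// dynamically allocated storage. Returns nullptr if the allocation fails, so the
// caller must be prepared to proceed without a stored exception.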
tbb_exception_ptr* tbb_exception_ptr::allocate() noexcept {
    tbb_exception_ptr* eptr = (tbb_exception_ptr*)allocate_memory(sizeof(tbb_exception_ptr));
    return eptr ? new (eptr) tbb_exception_ptr(std::current_exception()) : nullptr;
}

void tbb_exception_ptr::destroy() noexcept {
    this->~tbb_exception_ptr();
    deallocate_memory(this);
}

void tbb_exception_ptr::throw_self() {
    if (governor::rethrow_exception_broken()) fix_broken_rethrow();
    std::rethrow_exception(my_ptr);
}

//------------------------------------------------------------------------
// task_group_context
//------------------------------------------------------------------------

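// Detaches the context from its owner thread's context list (when it was bound) and
// releases the resources it holds. The removal has to be coordinated with a possible
// concurrent cancellation propagation and with the case when the owner thread itself
// is being destroyed; the local_update/nonlocal_update flags, the per-list mutex, and
// the lifetime-state CAS below implement that handshake.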
void task_group_context_impl::destroy(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    if (ctx.my_lifetime_state.load(std::memory_order_relaxed) == d1::task_group_context::lifetime_state::bound) {
        // The owner can be destroyed at any moment. Access the associated data with caution.
        thread_data* owner = ctx.my_owner.load(std::memory_order_relaxed);
        if (governor::is_thread_data_set(owner)) {
            thread_data::context_list_state& cls = owner->my_context_list_state;
            // We are the owner, so cls is valid.
            // Local update of the context list
            std::uintptr_t local_count_snapshot = cls.epoch.load(std::memory_order_relaxed);
            // The sequentially-consistent store prevents the load of the nonlocal update flag
            // from being hoisted above the store to the local update flag.
            cls.local_update = 1;
            if (cls.nonlocal_update.load(std::memory_order_relaxed)) {
                spin_mutex::scoped_lock lock(cls.mutex);
                ctx.my_node.remove_relaxed();
                cls.local_update.store(0, std::memory_order_relaxed);
            } else {
                ctx.my_node.remove_relaxed();
                // A release fence is necessary so that the update of our neighbors in
                // the context list is committed before a possible concurrent destroyer
                // proceeds once the local update flag is reset by the following store.
                cls.local_update.store(0, std::memory_order_release);
                if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) {
                    // Another thread was propagating a cancellation request when we removed
                    // ourselves from the list. We must ensure that it is not accessing us
                    // when this destructor finishes. We will be able to acquire the lock
                    // below only after the other thread finishes with us.
                    spin_mutex::scoped_lock lock(cls.mutex);
                }
            }
        } else {
            d1::task_group_context::lifetime_state expected = d1::task_group_context::lifetime_state::bound;
            if (
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
                !((std::atomic<typename std::underlying_type<d1::task_group_context::lifetime_state>::type>&)ctx.my_lifetime_state).compare_exchange_strong(
                (typename std::underlying_type<d1::task_group_context::lifetime_state>::type&)expected,
                    (typename std::underlying_type<d1::task_group_context::lifetime_state>::type)d1::task_group_context::lifetime_state::locked)
#else
                !ctx.my_lifetime_state.compare_exchange_strong(expected, d1::task_group_context::lifetime_state::locked)
#endif
                ) {
                __TBB_ASSERT(expected == d1::task_group_context::lifetime_state::detached, nullptr);
                // The "owner" local variable can be a dangling pointer here. Do not access it.
                owner = nullptr;
                // It is unsafe to remove the node because its neighbors might be already destroyed.
                // TODO: reconsider the logic.
                // ctx.my_node.remove_relaxed();
            } else {
                __TBB_ASSERT(expected == d1::task_group_context::lifetime_state::bound, nullptr);
                __TBB_ASSERT(ctx.my_owner.load(std::memory_order_relaxed) != nullptr, nullptr);
                thread_data::context_list_state& cls = owner->my_context_list_state;
                __TBB_ASSERT(is_alive(cls.nonlocal_update.load(std::memory_order_relaxed)), "The owner should be alive.");

                ++cls.nonlocal_update;
                ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::dying, std::memory_order_release);
                spin_wait_until_eq(cls.local_update, 0u);
                {
                    spin_mutex::scoped_lock lock(cls.mutex);
                    ctx.my_node.remove_relaxed();
                }
                --cls.nonlocal_update;
            }
        }
    }
    d1::cpu_ctl_env* ctl = reinterpret_cast<d1::cpu_ctl_env*>(&ctx.my_cpu_ctl_env);
#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER
    suppress_unused_warning(ctl);
#endif
    ctl->~cpu_ctl_env();

    if (ctx.my_exception)
        ctx.my_exception->destroy();
    ITT_STACK_DESTROY(ctx.my_itt_caller);

    poison_pointer(ctx.my_parent);
    poison_pointer(ctx.my_owner);
    poison_pointer(ctx.my_node.next);
    poison_pointer(ctx.my_node.prev);
    poison_pointer(ctx.my_exception);
    poison_pointer(ctx.my_itt_caller);
}

void task_group_context_impl::initialize(d1::task_group_context& ctx) {
    ITT_TASK_GROUP(&ctx, ctx.my_name, nullptr);

    ctx.my_cpu_ctl_env = 0;
    ctx.my_cancellation_requested = 0;
    ctx.my_state.store(0, std::memory_order_relaxed);
    // The context starts in the created state; it becomes bound (or isolated) on first use.
    ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::created, std::memory_order_relaxed);
    ctx.my_parent = nullptr;
    ctx.my_owner = nullptr;
    ctx.my_node.next.store(nullptr, std::memory_order_relaxed);
    ctx.my_node.prev.store(nullptr, std::memory_order_relaxed);
    ctx.my_exception = nullptr;
    ctx.my_itt_caller = nullptr;

    static_assert(sizeof(d1::cpu_ctl_env) <= sizeof(ctx.my_cpu_ctl_env), "FPU settings storage does not fit into uint64_t");
    d1::cpu_ctl_env* ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
    if (ctx.my_traits.fp_settings)
        ctl->get_env();
}

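// Links ctx into the head of td's thread-local context list so that later state
// propagation (cancellation, exception) performed by other threads can find it.
// The local_update/nonlocal_update handshake below mirrors the one in destroy().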
void task_group_context_impl::register_with(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(td, NULL);
    ctx.my_owner.store(td, std::memory_order_relaxed);
    thread_data::context_list_state& cls = td->my_context_list_state;
    // The state propagation logic assumes that new contexts are bound at the head of the list.
    ctx.my_node.prev.store(&cls.head, std::memory_order_relaxed);
    // Notify threads that may be concurrently destroying contexts registered
    // in this scheduler's list that a local list update is underway.
    // Prevent the load of the global propagation epoch counter from being hoisted before
    // the speculative stores above, as well as the load of the nonlocal update flag from
    // being hoisted before the store to the local update flag.
    cls.local_update = 1;
    // Finalize the local context list update
    if (cls.nonlocal_update.load(std::memory_order_relaxed)) {
        spin_mutex::scoped_lock lock(cls.mutex);
        d1::context_list_node* head_next = cls.head.next.load(std::memory_order_relaxed);
        head_next->prev.store(&ctx.my_node, std::memory_order_relaxed);
        ctx.my_node.next.store(head_next, std::memory_order_relaxed);
        cls.local_update.store(0, std::memory_order_relaxed);
        cls.head.next.store(&ctx.my_node, std::memory_order_relaxed);
    } else {
        d1::context_list_node* head_next = cls.head.next.load(std::memory_order_relaxed);
        head_next->prev.store(&ctx.my_node, std::memory_order_relaxed);
        ctx.my_node.next.store(head_next, std::memory_order_relaxed);
        cls.local_update.store(0, std::memory_order_release);
        // The thread-local list of contexts allows concurrent traversal by another thread
        // while a state change is being propagated. To ensure visibility of ctx.my_node's
        // members to the concurrently traversing thread, the list's head is updated by
        // means of a store-with-release.
        cls.head.next.store(&ctx.my_node, std::memory_order_release);
    }
}

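// Binds ctx to the context of the task currently being executed by td, inherits the
// parent's FPU settings when needed, and copies the parent's cancellation state.
// The copy is speculative: an epoch counter comparison detects a concurrent state
// propagation and, if one happened, the copy is repeated under the global lock.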
void task_group_context_impl::bind_to_impl(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) == d1::task_group_context::lifetime_state::locked, "The context can be bound only under the lock.");
    __TBB_ASSERT(!ctx.my_parent, "Parent is set before initial binding");

    ctx.my_parent = td->my_task_dispatcher->m_execute_data_ext.context;
    __TBB_ASSERT(ctx.my_parent, NULL);

    // Inherit FPU settings only if the context has not captured FPU settings yet.
    if (!ctx.my_traits.fp_settings)
        copy_fp_settings(ctx, *ctx.my_parent);

    // The condition below prevents unnecessary thrashing of the parent context's cache line
    if (ctx.my_parent->my_state.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) {
        ctx.my_parent->my_state.store(d1::task_group_context::may_have_children, std::memory_order_relaxed); // full fence is below
    }
    if (ctx.my_parent->my_parent) {
        // Even if this context were made accessible for state change propagation
        // (by placing store_with_release(td->my_context_list_state.head.my_next, &ctx.my_node)
        // above), it still could be missed if state propagation from a grand-ancestor
        // was underway concurrently with binding.
        // Speculative propagation from the parent, together with epoch counters
        // detecting the possibility of such a race, makes it possible to avoid taking
        // locks when there is no contention.

        // An acquire fence is necessary to prevent reordering of subsequent speculative
        // loads of parent state data out of the scope where the epoch counters comparison
        // can reliably validate it.
        uintptr_t local_count_snapshot = ctx.my_parent->my_owner.load(std::memory_order_relaxed)->my_context_list_state.epoch.load(std::memory_order_acquire);
        // Speculative propagation of the parent's state. The speculation will be
        // validated by the epoch counters check further on.
        ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
        register_with(ctx, td); // Issues full fence

        // If no state propagation was detected by the following condition, the above
        // full fence guarantees that the parent had the correct state during speculative
        // propagation before the fence. Otherwise the propagation from the parent is
        // repeated under the lock.
        if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) {
            // Another thread may be propagating a state change right now. So resort to the lock.
            context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
            ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
        }
    } else {
        register_with(ctx, td); // Issues full fence
        // As we do not have grand-ancestors, concurrent state propagation (if any)
        // may originate only from the parent context, and thus it is safe to directly
        // copy the state from it.
        ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
    }

    ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::bound, std::memory_order_release);
}

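// Lazily binds the context on its first use. The lifetime state goes
// created -> locked -> bound (or isolated); a concurrent caller that observes the
// locked state spin-waits until the thread that won the CAS finishes the binding.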
void task_group_context_impl::bind_to(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    d1::task_group_context::lifetime_state state = ctx.my_lifetime_state.load(std::memory_order_acquire);
    if (state <= d1::task_group_context::lifetime_state::locked) {
        if (state == d1::task_group_context::lifetime_state::created &&
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
            ((std::atomic<typename std::underlying_type<d1::task_group_context::lifetime_state>::type>&)ctx.my_lifetime_state).compare_exchange_strong(
            (typename std::underlying_type<d1::task_group_context::lifetime_state>::type&)state,
                (typename std::underlying_type<d1::task_group_context::lifetime_state>::type)d1::task_group_context::lifetime_state::locked)
#else
            ctx.my_lifetime_state.compare_exchange_strong(state, d1::task_group_context::lifetime_state::locked)
#endif
            ) {
            // If we are in the outermost task dispatch loop of a master thread, then
            // there is nothing to bind this context to, and we skip the binding part,
            // treating the context as isolated.
            __TBB_ASSERT(td->my_task_dispatcher->m_execute_data_ext.context != nullptr, nullptr);
            if (td->my_task_dispatcher->m_execute_data_ext.context == td->my_arena->my_default_ctx || !ctx.my_traits.bound) {
                if (!ctx.my_traits.fp_settings) {
                    copy_fp_settings(ctx, *td->my_arena->my_default_ctx);
                }
                ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::isolated, std::memory_order_release);
            } else {
                bind_to_impl(ctx, td);
            }
            ITT_STACK_CREATE(ctx.my_itt_caller);
        }
        spin_wait_while_eq(ctx.my_lifetime_state, d1::task_group_context::lifetime_state::locked);
    }
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) != d1::task_group_context::lifetime_state::created, NULL);
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) != d1::task_group_context::lifetime_state::locked, NULL);
}

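// Walks up from ctx through its ancestors; if src is found among them, new_state is
// applied to every context on the path from ctx up to (but not including) src.
// Contexts that do not descend from src are left untouched.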
template <typename T>
void task_group_context_impl::propagate_task_group_state(d1::task_group_context& ctx, std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    if ((ctx.*mptr_state).load(std::memory_order_relaxed) == new_state) {
        // Nothing to do, whether descending from "src" or not, so no need to scan.
        // Hopefully this happens often thanks to earlier invocations.
        // This optimization is enabled by the LIFO order in the context lists:
        // - new contexts are bound to the beginning of lists;
        // - descendants are newer than ancestors;
        // - earlier invocations are therefore likely to "paint" long chains.
    } else if (&ctx == &src) {
        // This clause is disjoint from the traversal below, which skips src entirely.
        // Note that src.*mptr_state is not necessarily still equal to new_state (another thread may have changed it again).
        // Such interference is probably not frequent enough to warrant writing new_state again (to make the other thread back down);
        // letting the other thread prevail may also be fairer.
    } else {
        for (d1::task_group_context* ancestor = ctx.my_parent; ancestor != NULL; ancestor = ancestor->my_parent) {
            if (ancestor == &src) {
                for (d1::task_group_context* c = &ctx; c != ancestor; c = c->my_parent)
                    (c->*mptr_state).store(new_state, std::memory_order_relaxed);
                break;
            }
        }
    }
}

bool task_group_context_impl::cancel_group_execution(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(ctx.my_cancellation_requested.load(std::memory_order_relaxed) <= 1, "The cancellation state can be either 0 or 1");
    if (ctx.my_cancellation_requested.load(std::memory_order_relaxed) || ctx.my_cancellation_requested.exchange(1)) {
        // This task group and any descendants have already been canceled.
        // (A newly added descendant would inherit its parent's ctx.my_cancellation_requested,
        // not missing out on any cancellation still being propagated, and a context cannot be uncanceled.)
        return false;
    }
    governor::get_thread_data()->my_arena->my_market->propagate_task_group_state(&d1::task_group_context::my_cancellation_requested, ctx, uint32_t(1));
    return true;
}
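
// A minimal usage sketch (illustrative only, not compiled as part of this file),
// assuming the public oneTBB API layered on top of this function; should_stop() is
// a hypothetical predicate supplied by the user:
//
//     tbb::task_group_context ctx;
//     tbb::parallel_for(0, n, [&](int i) {
//         if (should_stop(i))
//             ctx.cancel_group_execution();   // eventually reaches cancel_group_execution() above
//     }, ctx);
//
// Cancelling ctx marks it and all of its registered descendants as cancelled; the call
// returns false if cancellation had already been requested for this group.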

bool task_group_context_impl::is_group_execution_cancelled(const d1::task_group_context& ctx) {
    return ctx.my_cancellation_requested.load(std::memory_order_relaxed) != 0;
}

// IMPORTANT: It is assumed that this method is not used concurrently!
void task_group_context_impl::reset(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    //! TODO: Add assertion that this context does not have children
    // No fences are necessary since this context can be accessed from another thread
    // only after stealing happened (which means necessary fences were used).
    if (ctx.my_exception) {
        ctx.my_exception->destroy();
        ctx.my_exception = NULL;
    }
    ctx.my_cancellation_requested = 0;
}

// IMPORTANT: It is assumed that this method is not used concurrently!
void task_group_context_impl::capture_fp_settings(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    //! TODO: Add assertion that this context does not have children
    // No fences are necessary since this context can be accessed from another thread
    // only after stealing happened (which means necessary fences were used).
    d1::cpu_ctl_env* ctl = reinterpret_cast<d1::cpu_ctl_env*>(&ctx.my_cpu_ctl_env);
    if (!ctx.my_traits.fp_settings) {
        ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
        ctx.my_traits.fp_settings = true;
    }
    ctl->get_env();
}

void task_group_context_impl::copy_fp_settings(d1::task_group_context& ctx, const d1::task_group_context& src) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(!ctx.my_traits.fp_settings, "The context already has FPU settings.");
    __TBB_ASSERT(src.my_traits.fp_settings, "The source context does not have FPU settings.");

    const d1::cpu_ctl_env* src_ctl = reinterpret_cast<const d1::cpu_ctl_env*>(&src.my_cpu_ctl_env);
    new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env(*src_ctl);
    ctx.my_traits.fp_settings = true;
}

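// Applies new_state to every context registered in this thread's context list that
// descends from src, while holding the list's mutex to exclude concurrent insertion
// and removal. Afterwards the local epoch is synchronized with the global one.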
template <typename T>
void thread_data::propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    spin_mutex::scoped_lock lock(my_context_list_state.mutex);
    // The acquire fence is necessary to ensure that the subsequent node->next load
    // returns the correct value in case the node was just inserted by another thread.
    // The fence also ensures visibility of the correct ctx.my_parent value.
    d1::context_list_node* node = my_context_list_state.head.next.load(std::memory_order_acquire);
    while (node != &my_context_list_state.head) {
        d1::task_group_context& ctx = __TBB_get_object_ref(d1::task_group_context, my_node, node);
        if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state)
            task_group_context_impl::propagate_task_group_state(ctx, mptr_state, src, new_state);
        node = node->next.load(std::memory_order_relaxed);
    }
    // Sync up the local propagation epoch with the global one. The release fence prevents
    // reordering of a possible store to *mptr_state after the sync point.
    my_context_list_state.epoch.store(the_context_state_propagation_epoch.load(std::memory_order_relaxed), std::memory_order_release);
}

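// Propagates new_state from src to all contexts in the market: it first checks that src
// may have children at all, then, under the global propagation lock, bumps the global
// epoch and walks the context lists of every worker and master thread.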
template <typename T>
bool market::propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    if (src.my_state.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children)
        return true;
    // The whole propagation algorithm is under the lock in order to ensure correctness
    // in case of concurrent state changes at the different levels of the context tree.
    // See comment 2 at the bottom of this file.
    context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
    if ((src.*mptr_state).load(std::memory_order_relaxed) != new_state)
        // Another thread has concurrently changed the state. Back down.
        return false;
    // Advance the global state propagation epoch
    ++the_context_state_propagation_epoch;
    // Propagate to all workers and masters and sync up their local epochs with the global one
    unsigned num_workers = my_first_unused_worker_idx;
    for (unsigned i = 0; i < num_workers; ++i) {
        thread_data* td = my_workers[i];
        // If the worker is only about to be registered, skip it.
        if (td)
            td->propagate_task_group_state(mptr_state, src, new_state);
    }
    // Propagate to all master threads
    // The whole propagation sequence is locked, thus no contention is expected
    for (thread_data_list_type::iterator it = my_masters.begin(); it != my_masters.end(); it++)
        it->propagate_task_group_state(mptr_state, src, new_state);
    return true;
}

/*
    Comments:

1.  The premise of the cancellation support implementation is that cancellations are
    not part of the hot path of program execution. Therefore any change made to reduce
    the overhead of the cancellation control flow must be done in a way that does not
    increase the overhead of normal execution.

    In general, contexts are used by all threads, and their descendants are created in
    different threads as well. In order to minimize the impact of the cross-thread tree
    maintenance (first of all because of the synchronization), the tree of contexts
    is split into pieces, each of which is handled by a single thread. Such pieces
    are represented as lists of contexts, whose members are the contexts that were
    bound to their parents in the given thread.

    The context tree maintenance and cancellation propagation algorithms are designed
    in such a manner that cross-thread access to a context list takes place only
    when a cancellation signal is sent (by the user or when an exception happens), and
    synchronization is necessary only then. Thus the normal execution flow (without
    exceptions and cancellation) remains free from any synchronization done on
    behalf of exception handling and cancellation support.

2.  Consider parallel cancellations at different levels of the context tree:

        Ctx1 <- Cancelled by Thread1            |- Thread2 started processing
         |                                      |
        Ctx2                                    |- Thread1 started processing
         |                                   T1 |- Thread2 finishes and syncs up local counters
        Ctx3 <- Cancelled by Thread2            |
         |                                      |- Ctx5 is bound to Ctx2
        Ctx4                                    |
                                             T2 |- Thread1 reaches Ctx2

    The thread propagating each cancellation increments the global counter. However, the
    thread propagating the cancellation from the outermost context (Thread1) may be the
    last to finish, which means that the local counters may be synchronized earlier (by
    Thread2, at T1) than the cancellation is propagated into Ctx2 (at T2). If a new
    context (Ctx5) is created and bound to Ctx2 between T1 and T2, checking only its
    parent (Ctx2) may result in the cancellation request being lost.

    This issue is solved by doing the whole propagation under the lock.

    If we need more concurrency while processing parallel cancellations, we could try
    the following modification of the propagation algorithm:

    advance the global counter and remember it
    for each thread:
        scan the thread's list of contexts
    for each thread:
        sync up its local counter only if the global counter has not changed

    However, this version of the algorithm requires more analysis and verification.
*/

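// Exported entry points. These thin wrappers are called across the library boundary by
// the inline task_group_context machinery in the public headers; they simply forward to
// the internal implementation above.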
void __TBB_EXPORTED_FUNC initialize(d1::task_group_context& ctx) {
    task_group_context_impl::initialize(ctx);
}
void __TBB_EXPORTED_FUNC destroy(d1::task_group_context& ctx) {
    task_group_context_impl::destroy(ctx);
}
void __TBB_EXPORTED_FUNC reset(d1::task_group_context& ctx) {
    task_group_context_impl::reset(ctx);
}
bool __TBB_EXPORTED_FUNC cancel_group_execution(d1::task_group_context& ctx) {
    return task_group_context_impl::cancel_group_execution(ctx);
}
bool __TBB_EXPORTED_FUNC is_group_execution_cancelled(d1::task_group_context& ctx) {
    return task_group_context_impl::is_group_execution_cancelled(ctx);
}
void __TBB_EXPORTED_FUNC capture_fp_settings(d1::task_group_context& ctx) {
    task_group_context_impl::capture_fp_settings(ctx);
}

} // namespace r1
} // namespace detail
} // namespace tbb