/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include "oneapi/tbb/detail/_config.h"
#include "oneapi/tbb/tbb_allocator.h"
#include "oneapi/tbb/task_group.h"
#include "governor.h"
#include "thread_data.h"
#include "scheduler_common.h"
#include "itt_notify.h"
#include "task_dispatcher.h"

#include <type_traits>

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// tbb_exception_ptr
//------------------------------------------------------------------------
tbb_exception_ptr* tbb_exception_ptr::allocate() noexcept {
    tbb_exception_ptr* eptr = (tbb_exception_ptr*)allocate_memory(sizeof(tbb_exception_ptr));
    return eptr ? new (eptr) tbb_exception_ptr(std::current_exception()) : nullptr;
}

void tbb_exception_ptr::destroy() noexcept {
    this->~tbb_exception_ptr();
    deallocate_memory(this);
}

void tbb_exception_ptr::throw_self() {
    if (governor::rethrow_exception_broken()) fix_broken_rethrow();
    std::rethrow_exception(my_ptr);
}
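
// A minimal usage sketch (not part of the library; run_user_body() is an illustrative
// assumption) showing how tbb_exception_ptr is intended to carry an exception captured
// on one thread to the thread that later rethrows it:
//
//     tbb_exception_ptr* captured = nullptr;
//     try {
//         run_user_body();                          // hypothetical task body
//     } catch (...) {
//         captured = tbb_exception_ptr::allocate(); // snapshots std::current_exception()
//     }
//     // ... later, typically on the thread waiting for the work to complete:
//     if (captured) {
//         tbb_exception_ptr* e = captured;
//         captured = nullptr;
//         try {
//             e->throw_self();   // rethrows the stored exception
//         } catch (...) {
//             e->destroy();      // runs the destructor and frees the storage
//             throw;
//         }
//     }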

//------------------------------------------------------------------------
// task_group_context
//------------------------------------------------------------------------

void task_group_context_impl::destroy(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);

    if (ctx.my_context_list != nullptr) {
        __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) == d1::task_group_context::lifetime_state::bound, nullptr);
        // The owner can be destroyed at any moment. Access the associated data with caution.
        ctx.my_context_list->remove(ctx.my_node);
    }
    d1::cpu_ctl_env* ctl = reinterpret_cast<d1::cpu_ctl_env*>(&ctx.my_cpu_ctl_env);
#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER
    suppress_unused_warning(ctl);
#endif
    ctl->~cpu_ctl_env();

    if (ctx.my_exception)
        ctx.my_exception->destroy();
    ITT_STACK_DESTROY(ctx.my_itt_caller);

    poison_pointer(ctx.my_parent);
    poison_pointer(ctx.my_context_list);
    poison_pointer(ctx.my_node.my_next_node);
    poison_pointer(ctx.my_node.my_prev_node);
    poison_pointer(ctx.my_exception);
    poison_pointer(ctx.my_itt_caller);

    ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::dead, std::memory_order_release);
}

void task_group_context_impl::initialize(d1::task_group_context& ctx) {
    ITT_TASK_GROUP(&ctx, ctx.my_name, nullptr);

    ctx.my_node.my_next_node = &ctx.my_node;
    ctx.my_node.my_prev_node = &ctx.my_node;
    ctx.my_cpu_ctl_env = 0;
    ctx.my_cancellation_requested = 0;
    ctx.my_state.store(0, std::memory_order_relaxed);
    // The state is set to 'created' here; it transitions to 'bound' at the first use.
    ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::created, std::memory_order_relaxed);
    ctx.my_parent = nullptr;
    ctx.my_context_list = nullptr;
    ctx.my_exception = nullptr;
    ctx.my_itt_caller = nullptr;

    static_assert(sizeof(d1::cpu_ctl_env) <= sizeof(ctx.my_cpu_ctl_env), "FPU settings storage does not fit into uint64_t");
    d1::cpu_ctl_env* ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
    if (ctx.my_traits.fp_settings)
        ctl->get_env();
}

void task_group_context_impl::register_with(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    __TBB_ASSERT(td, nullptr);
    ctx.my_context_list = td->my_context_list;

    ctx.my_context_list->push_front(ctx.my_node);
}

void task_group_context_impl::bind_to_impl(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) == d1::task_group_context::lifetime_state::locked, "The context can be bound only under the lock.");
    __TBB_ASSERT(!ctx.my_parent, "Parent is set before initial binding");

    ctx.my_parent = td->my_task_dispatcher->m_execute_data_ext.context;
    __TBB_ASSERT(ctx.my_parent, nullptr);

    // Inherit FPU settings only if the context has not captured FPU settings yet.
    if (!ctx.my_traits.fp_settings)
        copy_fp_settings(ctx, *ctx.my_parent);

    // The condition below prevents unnecessary thrashing of the parent context's cache line
    if (ctx.my_parent->my_state.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) {
        ctx.my_parent->my_state.store(d1::task_group_context::may_have_children, std::memory_order_relaxed); // full fence is below
    }
    if (ctx.my_parent->my_parent) {
        // Even if this context were made accessible for state change propagation earlier
        // (by inserting ctx.my_node into the owner's context list above this point),
        // the propagation still could be missed if state propagation from a grand-ancestor
        // was underway concurrently with binding.
        // Speculative propagation from the parent, together with epoch counters that
        // detect the possibility of such a race, makes it possible to avoid taking
        // locks when there is no contention.

        // Acquire fence is necessary to prevent reordering of the subsequent speculative
        // loads of parent state data out of the scope where the epoch counters comparison
        // can reliably validate them.
        uintptr_t local_count_snapshot = ctx.my_parent->my_context_list->epoch.load(std::memory_order_acquire);
        // Speculative propagation of parent's state. The speculation will be
        // validated by the epoch counters check further on.
        ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
        register_with(ctx, td); // Issues full fence

        // If no state propagation was detected by the condition below, the above
        // full fence guarantees that the parent had the correct state during the
        // speculative propagation before the fence. Otherwise the propagation from
        // the parent is repeated under the lock.
        if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) {
            // Another thread may be propagating state change right now. So resort to lock.
            context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
            ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
        }
    } else {
        register_with(ctx, td); // Issues full fence
        // As we do not have grand-ancestors, concurrent state propagation (if any)
        // may originate only from the parent context, and thus it is safe to directly
        // copy the state from it.
        ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
    }

    ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::bound, std::memory_order_release);
}
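
// The binding above follows a "speculate, publish, then validate" pattern. A distilled
// sketch of the idea (names other than the memory orderings are illustrative and not
// the actual scheduler code):
//
//     uintptr_t snapshot = parent_list_epoch.load(std::memory_order_acquire);
//     child_state = parent_state;              // speculative copy (relaxed)
//     publish_child();                         // register_with(): issues a full fence
//     if (snapshot != global_propagation_epoch.load(std::memory_order_relaxed)) {
//         // A state propagation may have run concurrently with the speculative copy,
//         // so repeat the copy under the propagation lock to avoid losing it.
//         propagation_lock_type::scoped_lock lock(propagation_mutex);
//         child_state = parent_state;
//     }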

void task_group_context_impl::bind_to(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    d1::task_group_context::lifetime_state state = ctx.my_lifetime_state.load(std::memory_order_acquire);
    if (state <= d1::task_group_context::lifetime_state::locked) {
        if (state == d1::task_group_context::lifetime_state::created &&
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
            ((std::atomic<typename std::underlying_type<d1::task_group_context::lifetime_state>::type>&)ctx.my_lifetime_state).compare_exchange_strong(
            (typename std::underlying_type<d1::task_group_context::lifetime_state>::type&)state,
                (typename std::underlying_type<d1::task_group_context::lifetime_state>::type)d1::task_group_context::lifetime_state::locked)
#else
            ctx.my_lifetime_state.compare_exchange_strong(state, d1::task_group_context::lifetime_state::locked)
#endif
            ) {
            // If we are in the outermost task dispatch loop of an external thread, then
            // there is nothing to bind this context to, and we skip the binding part,
            // treating the context as isolated.
            __TBB_ASSERT(td->my_task_dispatcher->m_execute_data_ext.context != nullptr, nullptr);
            if (td->my_task_dispatcher->m_execute_data_ext.context == td->my_arena->my_default_ctx || !ctx.my_traits.bound) {
                if (!ctx.my_traits.fp_settings) {
                    copy_fp_settings(ctx, *td->my_arena->my_default_ctx);
                }
                ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::isolated, std::memory_order_release);
            } else {
                bind_to_impl(ctx, td);
            }
            ITT_STACK_CREATE(ctx.my_itt_caller);
        }
        spin_wait_while_eq(ctx.my_lifetime_state, d1::task_group_context::lifetime_state::locked);
    }
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) != d1::task_group_context::lifetime_state::created, nullptr);
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) != d1::task_group_context::lifetime_state::locked, nullptr);
}

template <typename T>
void task_group_context_impl::propagate_task_group_state(d1::task_group_context& ctx, std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    /*  1. if ((ctx.*mptr_state).load(std::memory_order_relaxed) == new_state):
            Nothing to do, whether descending from "src" or not, so no need to scan.
            Hopefully this happens often thanks to earlier invocations.
            This optimization is enabled by LIFO order in the context lists:
                - new contexts are bound to the beginning of lists;
                - descendants are newer than ancestors;
                - earlier invocations are therefore likely to "paint" long chains.
        2. if (&ctx != &src):
            This clause is disjoint from the traversal below, which skips src entirely.
            Note that src.*mptr_state is not necessarily still equal to new_state (another thread may have changed it again).
            Such interference is probably not frequent enough to aim for optimisation by writing new_state again (to make the other thread back down).
            Letting the other thread prevail may also be fairer.
    */
    if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state && &ctx != &src) {
        for (d1::task_group_context* ancestor = ctx.my_parent; ancestor != nullptr; ancestor = ancestor->my_parent) {
            if (ancestor == &src) {
                for (d1::task_group_context* c = &ctx; c != ancestor; c = c->my_parent)
                    (c->*mptr_state).store(new_state, std::memory_order_relaxed);
                break;
            }
        }
    }
}

template <typename T>
void thread_data::propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    mutex::scoped_lock lock(my_context_list->m_mutex);
    // Acquire fence is necessary to ensure that the subsequent load of the next list node
    // returns the correct value in case the node was just inserted by another thread.
    // The fence also ensures visibility of the correct ctx.my_parent value.
    for (context_list::iterator it = my_context_list->begin(); it != my_context_list->end(); ++it) {
        d1::task_group_context& ctx = __TBB_get_object_ref(d1::task_group_context, my_node, &(*it));
        if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state)
            task_group_context_impl::propagate_task_group_state(ctx, mptr_state, src, new_state);
    }
    // Sync up local propagation epoch with the global one. Release fence prevents
    // reordering of possible store to *mptr_state after the sync point.
    my_context_list->epoch.store(the_context_state_propagation_epoch.load(std::memory_order_relaxed), std::memory_order_release);
}

template <typename T>
bool market::propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    if (src.my_state.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children)
        return true;
    // The whole propagation algorithm is performed under the lock in order to ensure correctness
    // in case of concurrent state changes at different levels of the context tree.
    // See the comment at the bottom of this file.
    context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
    if ((src.*mptr_state).load(std::memory_order_relaxed) != new_state)
        // Another thread has concurrently changed the state. Back down.
        return false;
    // Advance global state propagation epoch
    ++the_context_state_propagation_epoch;
    // Propagate to all workers and external threads and sync up their local epochs with the global one
    unsigned num_workers = my_first_unused_worker_idx;
    for (unsigned i = 0; i < num_workers; ++i) {
        thread_data* td = my_workers[i];
        // If the worker is only about to be registered, skip it.
        if (td)
            td->propagate_task_group_state(mptr_state, src, new_state);
    }
    // Propagate to all external threads
    // The whole propagation sequence is locked, thus no contention is expected
    for (thread_data_list_type::iterator it = my_masters.begin(); it != my_masters.end(); it++)
        it->propagate_task_group_state(mptr_state, src, new_state);
    return true;
}

bool task_group_context_impl::cancel_group_execution(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    __TBB_ASSERT(ctx.my_cancellation_requested.load(std::memory_order_relaxed) <= 1, "The cancellation state can be either 0 or 1");
    if (ctx.my_cancellation_requested.load(std::memory_order_relaxed) || ctx.my_cancellation_requested.exchange(1)) {
        // This task group and any descendants have already been canceled.
        // (A newly added descendant would inherit its parent's ctx.my_cancellation_requested,
        // not missing out on any cancellation still being propagated, and a context cannot be uncanceled.)
        return false;
    }
    governor::get_thread_data()->my_arena->my_market->propagate_task_group_state(&d1::task_group_context::my_cancellation_requested, ctx, uint32_t(1));
    return true;
}
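
// A user-level sketch of what eventually reaches cancel_group_execution() above. The
// algorithm, range, and stop predicate are assumptions for illustration; the public
// entry points are tbb::task_group_context::cancel_group_execution() and
// tbb::task_group_context::is_group_execution_cancelled():
//
//     tbb::task_group_context ctx;
//     tbb::parallel_for(0, n, [&](int i) {
//         if (should_stop(i))                   // hypothetical predicate
//             ctx.cancel_group_execution();     // cancels ctx and all its descendants
//     }, ctx);
//     bool was_cancelled = ctx.is_group_execution_cancelled();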

bool task_group_context_impl::is_group_execution_cancelled(const d1::task_group_context& ctx) {
    return ctx.my_cancellation_requested.load(std::memory_order_relaxed) != 0;
}

// IMPORTANT: It is assumed that this method is not used concurrently!
void task_group_context_impl::reset(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    //! TODO: Add assertion that this context does not have children
    // No fences are necessary since this context can be accessed from another thread
    // only after stealing happened (which means necessary fences were used).
    if (ctx.my_exception) {
        ctx.my_exception->destroy();
        ctx.my_exception = nullptr;
    }
    ctx.my_cancellation_requested = 0;
}

// IMPORTANT: It is assumed that this method is not used concurrently!
void task_group_context_impl::capture_fp_settings(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    //! TODO: Add assertion that this context does not have children
    // No fences are necessary since this context can be accessed from another thread
    // only after stealing happened (which means necessary fences were used).
    d1::cpu_ctl_env* ctl = reinterpret_cast<d1::cpu_ctl_env*>(&ctx.my_cpu_ctl_env);
    if (!ctx.my_traits.fp_settings) {
        ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
        ctx.my_traits.fp_settings = true;
    }
    ctl->get_env();
}

void task_group_context_impl::copy_fp_settings(d1::task_group_context& ctx, const d1::task_group_context& src) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    __TBB_ASSERT(!ctx.my_traits.fp_settings, "The context already has FPU settings.");
    __TBB_ASSERT(src.my_traits.fp_settings, "The source context does not have FPU settings.");

    const d1::cpu_ctl_env* src_ctl = reinterpret_cast<const d1::cpu_ctl_env*>(&src.my_cpu_ctl_env);
    new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env(*src_ctl);
    ctx.my_traits.fp_settings = true;
}
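
// A brief sketch of how FPU settings capture is typically reached from the public API
// (the surrounding code is an illustrative assumption). The public
// task_group_context::capture_fp_settings() member forwards to capture_fp_settings()
// above, so work run under the context observes the settings recorded at the capture
// point rather than those of the executing thread:
//
//     std::fesetround(FE_TOWARDZERO);       // alter the FPU environment (<cfenv>)
//     tbb::task_group_context ctx;
//     ctx.capture_fp_settings();            // snapshots the current FPU settings into ctx
//     std::fesetround(FE_TONEAREST);        // the captured settings are unaffected
//     tbb::parallel_for(0, n, body, ctx);   // body runs under the captured settings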

/*
    Comments:

1.  The premise of the cancellation support implementation is that cancellations are
    not part of the hot path of the program execution. Therefore any changes made to
    reduce the overhead of the cancellation control flow must be done in ways that do
    not increase the overhead of normal execution.

    In general, contexts are used by all threads and their descendants are created in
    different threads as well. In order to minimize impact of the cross-thread tree
    maintenance (first of all because of the synchronization), the tree of contexts
    is split into pieces, each of which is handled by a single thread. Such pieces
    are represented as lists of contexts, members of which are contexts that were
    bound to their parents in the given thread.

    The context tree maintenance and cancellation propagation algorithms are designed
    in such a manner that cross-thread access to a context list will take place only
    when cancellation signal is sent (by user or when an exception happens), and
    synchronization is necessary only then. Thus the normal execution flow (without
    exceptions and cancellation) remains free from any synchronization done on
    behalf of exception handling and cancellation support.

2.  Consider parallel cancellations at the different levels of the context tree:

        Ctx1 <- Cancelled by Thread1            |- Thread2 started processing
         |                                      |
        Ctx2                                    |- Thread1 started processing
         |                                   T1 |- Thread2 finishes and syncs up local counters
        Ctx3 <- Cancelled by Thread2            |
         |                                      |- Ctx5 is bound to Ctx2
        Ctx4                                    |
                                             T2 |- Thread1 reaches Ctx2

    The thread propagating each cancellation increments the global counter. However, the
    thread propagating the cancellation from the outermost context (Thread1) may be the
    last to finish. This means that the local counters may be synchronized (by Thread2,
    at Time1) before Thread1 has propagated the cancellation into Ctx2 (at Time2). If a
    new context (Ctx5) is created and bound to Ctx2 between Time1 and Time2, checking
    only its parent (Ctx2) may result in the cancellation request being lost.

    This issue is solved by doing the whole propagation under the lock.

    If we need more concurrency while processing parallel cancellations, we could try
    the following modification of the propagation algorithm:

    advance global counter and remember it
    for each thread:
        scan thread's list of contexts
    for each thread:
        sync up its local counter only if the global counter has not been changed

    However this version of the algorithm requires more analysis and verification.
*/
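
// A sketch of the alternative propagation algorithm mentioned at the end of the comment
// above. It is not implemented; the enumeration helpers are hypothetical and the
// correctness analysis called for above has not been done:
//
//     uintptr_t my_epoch = ++the_context_state_propagation_epoch;   // advance and remember
//     for (thread_data& td : all_threads)                           // hypothetical thread enumeration
//         scan_context_list(td, mptr_state, src, new_state);        // propagate without epoch sync
//     for (thread_data& td : all_threads)
//         if (the_context_state_propagation_epoch.load(std::memory_order_relaxed) == my_epoch)
//             td.my_context_list->epoch.store(my_epoch, std::memory_order_release); // sync only if unchanged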

void __TBB_EXPORTED_FUNC initialize(d1::task_group_context& ctx) {
    task_group_context_impl::initialize(ctx);
}
void __TBB_EXPORTED_FUNC destroy(d1::task_group_context& ctx) {
    task_group_context_impl::destroy(ctx);
}
void __TBB_EXPORTED_FUNC reset(d1::task_group_context& ctx) {
    task_group_context_impl::reset(ctx);
}
bool __TBB_EXPORTED_FUNC cancel_group_execution(d1::task_group_context& ctx) {
    return task_group_context_impl::cancel_group_execution(ctx);
}
bool __TBB_EXPORTED_FUNC is_group_execution_cancelled(d1::task_group_context& ctx) {
    return task_group_context_impl::is_group_execution_cancelled(ctx);
}
void __TBB_EXPORTED_FUNC capture_fp_settings(d1::task_group_context& ctx) {
    task_group_context_impl::capture_fp_settings(ctx);
}

} // namespace r1
} // namespace detail
} // namespace tbb