// xref: /oneTBB/src/tbb/task_group_context.cpp (revision 49e08aac)
/*
    Copyright (c) 2005-2020 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include "oneapi/tbb/detail/_config.h"
#include "oneapi/tbb/tbb_allocator.h"
#include "oneapi/tbb/task_group.h"
#include "governor.h"
#include "thread_data.h"
#include "scheduler_common.h"
#include "itt_notify.h"
#include "task_dispatcher.h"

#include <type_traits>

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// tbb_exception_ptr
//------------------------------------------------------------------------
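// Allocates storage for a tbb_exception_ptr and captures std::current_exception() into it.
// Returns nullptr if the underlying allocation fails, so callers must handle that case.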
tbb_exception_ptr* tbb_exception_ptr::allocate() noexcept {
    tbb_exception_ptr* eptr = (tbb_exception_ptr*)allocate_memory(sizeof(tbb_exception_ptr));
    return eptr ? new (eptr) tbb_exception_ptr(std::current_exception()) : nullptr;
}

void tbb_exception_ptr::destroy() noexcept {
    this->~tbb_exception_ptr();
    deallocate_memory(this);
}

void tbb_exception_ptr::throw_self() {
    if (governor::rethrow_exception_broken()) fix_broken_rethrow();
    std::rethrow_exception(my_ptr);
}

//------------------------------------------------------------------------
// task_group_context
//------------------------------------------------------------------------

void task_group_context_impl::destroy(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
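    // A context bound into a thread's context list must be unlinked from that list before it dies.
    // Three cases are handled below: the owner thread destroys its own context, a non-owner
    // destroys a context whose owner has already detached, or a non-owner races with a live owner.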
    if (ctx.my_lifetime_state.load(std::memory_order_relaxed) == d1::task_group_context::lifetime_state::bound) {
        // The owner can be destroyed at any moment. Access the associated data with caution.
        thread_data* owner = ctx.my_owner.load(std::memory_order_relaxed);
        if (governor::is_thread_data_set(owner)) {
            thread_data::context_list_state& cls = owner->my_context_list_state;
            // We are the owner, so cls is valid.
            // Local update of the context list
            std::uintptr_t local_count_snapshot = cls.epoch.load(std::memory_order_relaxed);
            // The sequentially consistent store prevents the load of the nonlocal update flag
            // from being hoisted above the store to the local update flag.
            cls.local_update = 1;
            if (cls.nonlocal_update.load(std::memory_order_relaxed)) {
                spin_mutex::scoped_lock lock(cls.mutex);
                ctx.my_node.remove_relaxed();
                cls.local_update.store(0, std::memory_order_relaxed);
            } else {
                ctx.my_node.remove_relaxed();
                // The release fence is necessary so that the update of our neighbors in
                // the context list is committed before a possible concurrent destroyer
                // proceeds once the local update flag is reset by the following store.
                cls.local_update.store(0, std::memory_order_release);
                if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) {
                    // Another thread was propagating cancellation request when we removed
                    // ourselves from the list. We must ensure that it is not accessing us
                    // when this destructor finishes. We'll be able to acquire the lock
                    // below only after the other thread finishes with us.
                    spin_mutex::scoped_lock lock(cls.mutex);
                }
            }
        } else {
            d1::task_group_context::lifetime_state expected = d1::task_group_context::lifetime_state::bound;
            if (
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
                !((std::atomic<typename std::underlying_type<d1::task_group_context::lifetime_state>::type>&)ctx.my_lifetime_state).compare_exchange_strong(
                (typename std::underlying_type<d1::task_group_context::lifetime_state>::type&)expected,
                    (typename std::underlying_type<d1::task_group_context::lifetime_state>::type)d1::task_group_context::lifetime_state::locked)
#else
                !ctx.my_lifetime_state.compare_exchange_strong(expected, d1::task_group_context::lifetime_state::locked)
#endif
                ) {
                __TBB_ASSERT(expected == d1::task_group_context::lifetime_state::detached, nullptr);
                // The "owner" local variable can be a dangling pointer here. Do not access it.
                owner = nullptr;
                // It is unsafe to remove the node because its neighbors might be already destroyed.
                // TODO: reconsider the logic.
                // ctx.my_node.remove_relaxed();
            } else {
                __TBB_ASSERT(expected == d1::task_group_context::lifetime_state::bound, nullptr);
                __TBB_ASSERT(ctx.my_owner.load(std::memory_order_relaxed) != nullptr, nullptr);
                thread_data::context_list_state& cls = owner->my_context_list_state;
                __TBB_ASSERT(is_alive(cls.nonlocal_update.load(std::memory_order_relaxed)), "The owner should be alive.");

                ++cls.nonlocal_update;
                ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::dying, std::memory_order_release);
                spin_wait_until_eq(cls.local_update, 0u);
                {
                    spin_mutex::scoped_lock lock(cls.mutex);
                    ctx.my_node.remove_relaxed();
                }
                --cls.nonlocal_update;
            }
        }
    }
    d1::cpu_ctl_env* ctl = reinterpret_cast<d1::cpu_ctl_env*>(&ctx.my_cpu_ctl_env);
#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER
    suppress_unused_warning(ctl);
#endif
    ctl->~cpu_ctl_env();

    if (ctx.my_exception)
        ctx.my_exception->destroy();
    ITT_STACK_DESTROY(ctx.my_itt_caller);

    poison_pointer(ctx.my_parent);
    poison_pointer(ctx.my_owner);
    poison_pointer(ctx.my_node.next);
    poison_pointer(ctx.my_node.prev);
    poison_pointer(ctx.my_exception);
    poison_pointer(ctx.my_itt_caller);
}

void task_group_context_impl::initialize(d1::task_group_context& ctx) {
    ITT_TASK_GROUP(&ctx, ctx.my_name, nullptr);

    ctx.my_cpu_ctl_env = 0;
    ctx.my_cancellation_requested = 0;
    ctx.my_state.store(0, std::memory_order_relaxed);
    // The context starts in the 'created' state and transitions to 'bound' at its first use.
    ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::created, std::memory_order_relaxed);
    ctx.my_parent = nullptr;
    ctx.my_owner = nullptr;
    ctx.my_node.next.store(nullptr, std::memory_order_relaxed);
    ctx.my_node.prev.store(nullptr, std::memory_order_relaxed);
    ctx.my_exception = nullptr;
    ctx.my_itt_caller = nullptr;

    static_assert(sizeof(d1::cpu_ctl_env) <= sizeof(ctx.my_cpu_ctl_env), "FPU settings storage does not fit into uint64_t");
    d1::cpu_ctl_env* ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
    if (ctx.my_traits.fp_settings)
        ctl->get_env();
}

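// Registers ctx with td: records td as the owner and links ctx at the head of td's
// thread-local context list, where the state propagation code can find it.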
void task_group_context_impl::register_with(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(td, NULL);
    ctx.my_owner.store(td, std::memory_order_relaxed);
    thread_data::context_list_state& cls = td->my_context_list_state;
    // The state propagation logic assumes new contexts are bound at the head of the list.
    ctx.my_node.prev.store(&cls.head, std::memory_order_relaxed);
    // Notify threads that may be concurrently destroying contexts registered
    // in this thread's list that a local list update is underway.
    // Prevent the load of the global propagation epoch counter from being hoisted before
    // the speculative stores above, as well as the load of the nonlocal update flag from
    // being hoisted before the store to the local update flag.
    cls.local_update = 1;
    // Finalize local context list update
    if (cls.nonlocal_update.load(std::memory_order_relaxed)) {
        spin_mutex::scoped_lock lock(cls.mutex);
        d1::context_list_node* head_next = cls.head.next.load(std::memory_order_relaxed);
        head_next->prev.store(&ctx.my_node, std::memory_order_relaxed);
        ctx.my_node.next.store(head_next, std::memory_order_relaxed);
        cls.local_update.store(0, std::memory_order_relaxed);
        cls.head.next.store(&ctx.my_node, std::memory_order_relaxed);
    } else {
        d1::context_list_node* head_next = cls.head.next.load(std::memory_order_relaxed);
        head_next->prev.store(&ctx.my_node, std::memory_order_relaxed);
        ctx.my_node.next.store(head_next, std::memory_order_relaxed);
        cls.local_update.store(0, std::memory_order_release);
        // Thread-local list of contexts allows concurrent traversal by another thread
        // while propagating state change. To ensure visibility of ctx.my_node's members
        // to the concurrently traversing thread, the list's head is updated by means
        // of store-with-release.
        cls.head.next.store(&ctx.my_node, std::memory_order_release);
    }
}

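// Binds ctx to the context of the task currently executed by td (its parent) and registers it
// in td's context list. Must be called with ctx.my_lifetime_state equal to 'locked'.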
void task_group_context_impl::bind_to_impl(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) == d1::task_group_context::lifetime_state::locked, "The context can be bound only under the lock.");
    __TBB_ASSERT(!ctx.my_parent, "Parent is set before initial binding");

    ctx.my_parent = td->my_task_dispatcher->m_execute_data_ext.context;
    __TBB_ASSERT(ctx.my_parent, NULL);

    // Inherit FPU settings only if the context has not captured FPU settings yet.
    if (!ctx.my_traits.fp_settings)
        copy_fp_settings(ctx, *ctx.my_parent);

    // The condition below prevents unnecessary thrashing of the parent context's cache line
    if (ctx.my_parent->my_state.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) {
        ctx.my_parent->my_state.store(d1::task_group_context::may_have_children, std::memory_order_relaxed); // full fence is below
    }
    if (ctx.my_parent->my_parent) {
        // Even if this context were made accessible for state change propagation
        // (by performing the release store of &ctx.my_node into td->my_context_list_state.head.next
        // above this point), the propagation still could be missed if state propagation
        // from a grand-ancestor was underway concurrently with binding.
        // Speculative propagation from the parent, together with epoch counters that
        // detect the possibility of such a race, makes it possible to avoid taking locks
        // when there is no contention.

        // The acquire fence is necessary to prevent the subsequent speculative loads of
        // the parent state data from being reordered out of the scope where the epoch
        // counter comparison can reliably validate them.
        uintptr_t local_count_snapshot = ctx.my_parent->my_owner.load(std::memory_order_relaxed)->my_context_list_state.epoch.load(std::memory_order_acquire);
        // Speculative propagation of the parent's state. The speculation will be
        // validated by the epoch counter check further on.
        ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
        register_with(ctx, td); // Issues full fence

        // If no state propagation is detected by the following condition, the above
        // full fence guarantees that the parent had the correct state during the
        // speculative propagation before the fence. Otherwise the propagation from the
        // parent is repeated under the lock.
        if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) {
            // Another thread may be propagating a state change right now, so resort to the lock.
            context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
            ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
        }
    } else {
        register_with(ctx, td); // Issues full fence
        // As we do not have grand-ancestors, concurrent state propagation (if any)
        // may originate only from the parent context, and thus it is safe to directly
        // copy the state from it.
        ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
    }

    ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::bound, std::memory_order_release);
}

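// Performs one-time binding of ctx on its first use by an executing task. Exactly one thread
// wins the transition created -> locked and does the binding (or marks the context isolated);
// any other thread spin-waits until the lifetime state leaves 'locked'.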
void task_group_context_impl::bind_to(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    d1::task_group_context::lifetime_state state = ctx.my_lifetime_state.load(std::memory_order_acquire);
    if (state <= d1::task_group_context::lifetime_state::locked) {
        if (state == d1::task_group_context::lifetime_state::created &&
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
            ((std::atomic<typename std::underlying_type<d1::task_group_context::lifetime_state>::type>&)ctx.my_lifetime_state).compare_exchange_strong(
            (typename std::underlying_type<d1::task_group_context::lifetime_state>::type&)state,
                (typename std::underlying_type<d1::task_group_context::lifetime_state>::type)d1::task_group_context::lifetime_state::locked)
#else
            ctx.my_lifetime_state.compare_exchange_strong(state, d1::task_group_context::lifetime_state::locked)
#endif
            ) {
            // If we are in the outermost task dispatch loop of a master thread, then
            // there is nothing to bind this context to, and we skip the binding part
            // treating the context as isolated.
            __TBB_ASSERT(td->my_task_dispatcher->m_execute_data_ext.context != nullptr, nullptr);
            if (td->my_task_dispatcher->m_execute_data_ext.context == td->my_arena->my_default_ctx || !ctx.my_traits.bound) {
                if (!ctx.my_traits.fp_settings) {
                    copy_fp_settings(ctx, *td->my_arena->my_default_ctx);
                }
                ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::isolated, std::memory_order_release);
            } else {
                bind_to_impl(ctx, td);
            }
            ITT_STACK_CREATE(ctx.my_itt_caller);
        }
        spin_wait_while_eq(ctx.my_lifetime_state, d1::task_group_context::lifetime_state::locked);
    }
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) != d1::task_group_context::lifetime_state::created, NULL);
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) != d1::task_group_context::lifetime_state::locked, NULL);
}

template <typename T>
void task_group_context_impl::propagate_task_group_state(d1::task_group_context& ctx, std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    if ((ctx.*mptr_state).load(std::memory_order_relaxed) == new_state) {
        // Nothing to do, whether descending from "src" or not, so no need to scan.
        // Hopefully this happens often thanks to earlier invocations.
        // This optimization is enabled by LIFO order in the context lists:
        // - new contexts are bound to the beginning of lists;
        // - descendants are newer than ancestors;
        // - earlier invocations are therefore likely to "paint" long chains.
    } else if (&ctx == &src) {
        // This clause is disjoint from the traversal below, which skips src entirely.
        // Note that src.*mptr_state is not necessarily still equal to new_state (another thread may have changed it again).
        // Such interference is probably not frequent enough to justify the optimization of writing new_state again (to make the other thread back down).
        // Letting the other thread prevail may also be fairer.
    } else {
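        // Walk up the ancestor chain; if src turns out to be an ancestor of ctx, paint every
        // context on the path from ctx up to (but not including) src with the new state.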
        for (d1::task_group_context* ancestor = ctx.my_parent; ancestor != NULL; ancestor = ancestor->my_parent) {
            if (ancestor == &src) {
                for (d1::task_group_context* c = &ctx; c != ancestor; c = c->my_parent)
                    (c->*mptr_state).store(new_state, std::memory_order_relaxed);
                break;
            }
        }
    }
}

bool task_group_context_impl::cancel_group_execution(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(ctx.my_cancellation_requested.load(std::memory_order_relaxed) <= 1, "The cancellation state can be either 0 or 1");
    if (ctx.my_cancellation_requested.load(std::memory_order_relaxed) || ctx.my_cancellation_requested.exchange(1)) {
        // This task group and any descendants have already been canceled.
        // (A newly added descendant would inherit its parent's ctx.my_cancellation_requested,
        // not missing out on any cancellation still being propagated, and a context cannot be uncanceled.)
        return false;
    }
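    // We won the exchange above, so we are the first to request cancellation for this group;
    // propagate the request down the context tree.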
    governor::get_thread_data()->my_arena->my_market->propagate_task_group_state(&d1::task_group_context::my_cancellation_requested, ctx, uint32_t(1));
    return true;
}

bool task_group_context_impl::is_group_execution_cancelled(const d1::task_group_context& ctx) {
    return ctx.my_cancellation_requested.load(std::memory_order_relaxed) != 0;
}

// IMPORTANT: It is assumed that this method is not used concurrently!
void task_group_context_impl::reset(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    //! TODO: Add assertion that this context does not have children
    // No fences are necessary since this context can be accessed from another thread
    // only after stealing has happened (which means the necessary fences were already used).
    if (ctx.my_exception) {
        ctx.my_exception->destroy();
        ctx.my_exception = NULL;
    }
    ctx.my_cancellation_requested = 0;
}

// IMPORTANT: It is assumed that this method is not used concurrently!
void task_group_context_impl::capture_fp_settings(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    //! TODO: Add assertion that this context does not have children
    // No fences are necessary since this context can be accessed from another thread
    // only after stealing has happened (which means the necessary fences were already used).
    d1::cpu_ctl_env* ctl = reinterpret_cast<d1::cpu_ctl_env*>(&ctx.my_cpu_ctl_env);
    if (!ctx.my_traits.fp_settings) {
        ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
        ctx.my_traits.fp_settings = true;
    }
    ctl->get_env();
}

void task_group_context_impl::copy_fp_settings(d1::task_group_context& ctx, const d1::task_group_context& src) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(!ctx.my_traits.fp_settings, "The context already has FPU settings.");
    __TBB_ASSERT(src.my_traits.fp_settings, "The source context does not have FPU settings.");

    const d1::cpu_ctl_env* src_ctl = reinterpret_cast<const d1::cpu_ctl_env*>(&src.my_cpu_ctl_env);
    new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env(*src_ctl);
    ctx.my_traits.fp_settings = true;
}

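// Walks this thread's context list under its mutex and propagates the state change originating
// from src to every registered context that has not observed it yet, then syncs up the local
// propagation epoch with the global one.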
template <typename T>
void thread_data::propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    spin_mutex::scoped_lock lock(my_context_list_state.mutex);
    // The acquire fence is necessary to ensure that the subsequent node->next loads
    // return the correct values in case a node was just inserted by another thread.
    // The fence also ensures visibility of the correct ctx.my_parent value.
    d1::context_list_node* node = my_context_list_state.head.next.load(std::memory_order_acquire);
    while (node != &my_context_list_state.head) {
        d1::task_group_context& ctx = __TBB_get_object_ref(d1::task_group_context, my_node, node);
        if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state)
            task_group_context_impl::propagate_task_group_state(ctx, mptr_state, src, new_state);
        node = node->next.load(std::memory_order_relaxed);
    }
    // Sync up local propagation epoch with the global one. Release fence prevents
    // reordering of possible store to *mptr_state after the sync point.
    my_context_list_state.epoch.store(the_context_state_propagation_epoch.load(std::memory_order_relaxed), std::memory_order_release);
}

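// Propagates a state change (e.g. a cancellation request) from src to the context lists of all
// worker and master threads registered in this market. Returns false if another thread changed
// the state of src concurrently (the caller should back down), true otherwise.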
template <typename T>
bool market::propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    if (src.my_state.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children)
        return true;
    // The whole propagation algorithm is under the lock in order to ensure correctness
    // in case of concurrent state changes at the different levels of the context tree.
    // See the comments at the bottom of this file.
    context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
    if ((src.*mptr_state).load(std::memory_order_relaxed) != new_state)
        // Another thread has concurrently changed the state. Back down.
        return false;
    // Advance global state propagation epoch
    ++the_context_state_propagation_epoch;
    // Propagate to all workers and masters and sync up their local epochs with the global one
    unsigned num_workers = my_first_unused_worker_idx;
    for (unsigned i = 0; i < num_workers; ++i) {
        thread_data* td = my_workers[i];
        // If the worker is only about to be registered, skip it.
        if (td)
            td->propagate_task_group_state(mptr_state, src, new_state);
    }
    // Propagate to all master threads
    // The whole propagation sequence is locked, thus no contention is expected
    for (thread_data_list_type::iterator it = my_masters.begin(); it != my_masters.end(); it++)
        it->propagate_task_group_state(mptr_state, src, new_state);
    return true;
}

/*
    Comments:

1.  The premise of the cancellation support implementation is that cancellations are
    not part of the hot path of program execution. Therefore any changes made to its
    implementation in order to reduce the overhead of the cancellation control flow
    should be done only in ways that do not increase the overhead of normal execution.

    In general, contexts are used by all threads, and their descendants are created in
    different threads as well. In order to minimize the impact of cross-thread tree
    maintenance (first of all, the synchronization it requires), the tree of contexts
    is split into pieces, each of which is handled by a single thread. Each piece is
    represented as a list of the contexts that were bound to their parents in the
    given thread.

    The context tree maintenance and cancellation propagation algorithms are designed
    in such a manner that cross-thread access to a context list takes place only
    when a cancellation signal is sent (by the user or when an exception happens), and
    synchronization is necessary only then. Thus the normal execution flow (without
    exceptions and cancellation) remains free from any synchronization done on
    behalf of exception handling and cancellation support.

2.  Consider parallel cancellations at the different levels of the context tree:

        Ctx1 <- Cancelled by Thread1            |- Thread2 started processing
         |                                      |
        Ctx2                                    |- Thread1 started processing
         |                                   T1 |- Thread2 finishes and syncs up local counters
        Ctx3 <- Cancelled by Thread2            |
         |                                      |- Ctx5 is bound to Ctx2
        Ctx4                                    |
                                             T2 |- Thread1 reaches Ctx2

    The thread propagating each cancellation increments the global counter. However, the thread
    propagating the cancellation from the outermost context (Thread1) may be the last
    to finish, which means that the local counters may be synchronized earlier (by Thread2,
    at time T1) than the cancellation is propagated into Ctx2 (at time T2). If a new context
    (Ctx5) is created and bound to Ctx2 between T1 and T2, checking only its parent
    (Ctx2) may result in the cancellation request being lost.

    This issue is solved by doing the whole propagation under the lock.

    If we need more concurrency while processing parallel cancellations, we could try
    the following modification of the propagation algorithm:

    advance the global counter and remember it
    for each thread:
        scan the thread's list of contexts
    for each thread:
        sync up its local counter only if the global counter has not changed

    However, this version of the algorithm requires more analysis and verification.
*/

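// Exported entry points; they forward to the task_group_context_impl implementations above.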
void __TBB_EXPORTED_FUNC initialize(d1::task_group_context& ctx) {
    task_group_context_impl::initialize(ctx);
}
void __TBB_EXPORTED_FUNC destroy(d1::task_group_context& ctx) {
    task_group_context_impl::destroy(ctx);
}
void __TBB_EXPORTED_FUNC reset(d1::task_group_context& ctx) {
    task_group_context_impl::reset(ctx);
}
bool __TBB_EXPORTED_FUNC cancel_group_execution(d1::task_group_context& ctx) {
    return task_group_context_impl::cancel_group_execution(ctx);
}
bool __TBB_EXPORTED_FUNC is_group_execution_cancelled(d1::task_group_context& ctx) {
    return task_group_context_impl::is_group_execution_cancelled(ctx);
}
void __TBB_EXPORTED_FUNC capture_fp_settings(d1::task_group_context& ctx) {
    task_group_context_impl::capture_fp_settings(ctx);
}

} // namespace r1
} // namespace detail
} // namespace tbb