/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include "oneapi/tbb/detail/_config.h"
#include "oneapi/tbb/tbb_allocator.h"
#include "oneapi/tbb/task_group.h"
#include "governor.h"
#include "thread_data.h"
#include "scheduler_common.h"
#include "itt_notify.h"
#include "task_dispatcher.h"

#include <type_traits>

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// tbb_exception_ptr
//------------------------------------------------------------------------
tbb_exception_ptr* tbb_exception_ptr::allocate() noexcept {
    tbb_exception_ptr* eptr = (tbb_exception_ptr*)allocate_memory(sizeof(tbb_exception_ptr));
    return eptr ? new (eptr) tbb_exception_ptr(std::current_exception()) : nullptr;
}

void tbb_exception_ptr::destroy() noexcept {
    this->~tbb_exception_ptr();
    deallocate_memory(this);
}

void tbb_exception_ptr::throw_self() {
    if (governor::rethrow_exception_broken()) fix_broken_rethrow();
    std::rethrow_exception(my_ptr);
}

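/*
    Illustrative sketch (not compiled as part of this file): the capture/rethrow protocol
    formed by the three methods above. The actual capture site lives in the task dispatcher,
    which publishes the pointer through d1::task_group_context::my_exception and performs
    additional synchronization omitted here; run_task() is a placeholder for the user task body.

        tbb_exception_ptr* captured = nullptr;
        try {
            run_task();
        } catch (...) {
            captured = tbb_exception_ptr::allocate();   // snapshots std::current_exception()
        }
        // ... later, on the thread that waits for completion ...
        if (captured) {
            captured->throw_self();                     // rethrows the captured exception
        }
        // The holder is eventually released with captured->destroy(); see
        // task_group_context_impl::destroy() and reset() below.
*/
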
//------------------------------------------------------------------------
// task_group_context
//------------------------------------------------------------------------

void task_group_context_impl::destroy(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);

    auto ctx_lifetime_state = ctx.my_lifetime_state.load(std::memory_order_relaxed);
    __TBB_ASSERT(ctx_lifetime_state != d1::task_group_context::lifetime_state::locked, nullptr);

    if (ctx_lifetime_state == d1::task_group_context::lifetime_state::bound) {
        // The owner can be destroyed at any moment. Access the associated data with caution.
        thread_data* owner = ctx.my_owner.load(std::memory_order_relaxed);
        if (governor::is_thread_data_set(owner)) {
            thread_data::context_list_state& cls = owner->my_context_list_state;
            // We are the owner, so cls is valid.
            // Local update of the context list
            std::uintptr_t local_count_snapshot = cls.epoch.load(std::memory_order_acquire);
            // The sequentially consistent store prevents the load of the nonlocal update flag
            // from being hoisted before the store to the local update flag.
            cls.local_update = 1;
            if (cls.nonlocal_update.load(std::memory_order_acquire)) {
                spin_mutex::scoped_lock lock(cls.mutex);
                ctx.my_node.remove_relaxed();
                cls.local_update.store(0, std::memory_order_relaxed);
            } else {
                ctx.my_node.remove_relaxed();
                // A release fence is necessary so that the update of our neighbors in
                // the context list is committed before a possible concurrent destroyer
                // proceeds once the local update flag is reset by the following store.
                cls.local_update.store(0, std::memory_order_release);
                if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) {
                    // Another thread was propagating a cancellation request when we removed
                    // ourselves from the list. We must ensure that it is not accessing us
                    // when this destructor finishes. We will be able to acquire the lock
                    // below only after the other thread finishes with us.
                    spin_mutex::scoped_lock lock(cls.mutex);
                } else {
                    // TODO: simplify exception propagation mechanism
                    std::atomic_thread_fence(std::memory_order_release);
                }
            }
        } else {
            d1::task_group_context::lifetime_state expected = d1::task_group_context::lifetime_state::bound;
            if (
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
                !((std::atomic<typename std::underlying_type<d1::task_group_context::lifetime_state>::type>&)ctx.my_lifetime_state).compare_exchange_strong(
                    (typename std::underlying_type<d1::task_group_context::lifetime_state>::type&)expected,
                    (typename std::underlying_type<d1::task_group_context::lifetime_state>::type)d1::task_group_context::lifetime_state::locked)
#else
                !ctx.my_lifetime_state.compare_exchange_strong(expected, d1::task_group_context::lifetime_state::locked)
#endif
            ) {
                __TBB_ASSERT(expected == d1::task_group_context::lifetime_state::detached, nullptr);
                // The "owner" local variable can be a dangling pointer here. Do not access it.
                owner = nullptr;
                spin_wait_until_eq(ctx.my_owner, nullptr);
                // It is unsafe to remove the node because its neighbors might be already destroyed.
                // TODO: reconsider the logic.
                // ctx.my_node.remove_relaxed();
            } else {
                __TBB_ASSERT(expected == d1::task_group_context::lifetime_state::bound, nullptr);
                __TBB_ASSERT(ctx.my_owner.load(std::memory_order_relaxed) != nullptr, nullptr);
                thread_data::context_list_state& cls = owner->my_context_list_state;
                __TBB_ASSERT(is_alive(cls.nonlocal_update.load(std::memory_order_relaxed)), "The owner should be alive.");

                ++cls.nonlocal_update;
                ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::dying, std::memory_order_release);
                spin_wait_until_eq(cls.local_update, 0u);
                {
                    spin_mutex::scoped_lock lock(cls.mutex);
                    ctx.my_node.remove_relaxed();
                }
                --cls.nonlocal_update;
            }
        }
    }

    if (ctx_lifetime_state == d1::task_group_context::lifetime_state::detached) {
        spin_wait_until_eq(ctx.my_owner, nullptr);
    }

    d1::cpu_ctl_env* ctl = reinterpret_cast<d1::cpu_ctl_env*>(&ctx.my_cpu_ctl_env);
#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER
    suppress_unused_warning(ctl);
#endif
    ctl->~cpu_ctl_env();

    if (ctx.my_exception)
        ctx.my_exception->destroy();
    ITT_STACK_DESTROY(ctx.my_itt_caller);

    poison_pointer(ctx.my_parent);
    poison_pointer(ctx.my_owner);
    poison_pointer(ctx.my_node.next);
    poison_pointer(ctx.my_node.prev);
    poison_pointer(ctx.my_exception);
    poison_pointer(ctx.my_itt_caller);
}

void task_group_context_impl::initialize(d1::task_group_context& ctx) {
    ITT_TASK_GROUP(&ctx, ctx.my_name, nullptr);

    ctx.my_cpu_ctl_env = 0;
    ctx.my_cancellation_requested = 0;
    ctx.my_state.store(0, std::memory_order_relaxed);
    // The context starts in the created state; it is switched to bound on first use.
    ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::created, std::memory_order_relaxed);
    ctx.my_parent = nullptr;
    ctx.my_owner = nullptr;
    ctx.my_node.next.store(nullptr, std::memory_order_relaxed);
    ctx.my_node.prev.store(nullptr, std::memory_order_relaxed);
    ctx.my_exception = nullptr;
    ctx.my_itt_caller = nullptr;

    static_assert(sizeof(d1::cpu_ctl_env) <= sizeof(ctx.my_cpu_ctl_env), "FPU settings storage does not fit into uint64_t");
    d1::cpu_ctl_env* ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
    if (ctx.my_traits.fp_settings)
        ctl->get_env();
}

void task_group_context_impl::register_with(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(td, NULL);
    ctx.my_owner.store(td, std::memory_order_relaxed);
    thread_data::context_list_state& cls = td->my_context_list_state;
    // State propagation logic assumes new contexts are bound to the head of the list.
    ctx.my_node.prev.store(&cls.head, std::memory_order_relaxed);
    // Notify threads that may be concurrently destroying contexts registered
    // in this scheduler's list that a local list update is underway.
    // Prevent the load of the global propagation epoch counter from being hoisted before
    // the speculative stores above, as well as the load of the nonlocal update flag from
    // being hoisted before the store to the local update flag.
    cls.local_update = 1;
    // Finalize local context list update
    if (cls.nonlocal_update.load(std::memory_order_acquire)) {
        spin_mutex::scoped_lock lock(cls.mutex);
        d1::context_list_node* head_next = cls.head.next.load(std::memory_order_relaxed);
        head_next->prev.store(&ctx.my_node, std::memory_order_relaxed);
        ctx.my_node.next.store(head_next, std::memory_order_relaxed);
        cls.local_update.store(0, std::memory_order_relaxed);
        cls.head.next.store(&ctx.my_node, std::memory_order_relaxed);
    } else {
        d1::context_list_node* head_next = cls.head.next.load(std::memory_order_relaxed);
        head_next->prev.store(&ctx.my_node, std::memory_order_relaxed);
        ctx.my_node.next.store(head_next, std::memory_order_relaxed);
        cls.local_update.store(0, std::memory_order_release);
        // The thread-local list of contexts allows concurrent traversal by another thread
        // while propagating a state change. To ensure visibility of ctx.my_node's members
        // to the concurrently traversing thread, the list's head is updated by means
        // of a store-with-release.
        cls.head.next.store(&ctx.my_node, std::memory_order_release);
    }
}

void task_group_context_impl::bind_to_impl(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) == d1::task_group_context::lifetime_state::locked, "The context can be bound only under the lock.");
    __TBB_ASSERT(!ctx.my_parent, "Parent is set before initial binding");

    ctx.my_parent = td->my_task_dispatcher->m_execute_data_ext.context;
    __TBB_ASSERT(ctx.my_parent, NULL);

    // Inherit FPU settings only if the context has not captured FPU settings yet.
    if (!ctx.my_traits.fp_settings)
        copy_fp_settings(ctx, *ctx.my_parent);

    // The condition below prevents unnecessary thrashing of the parent context's cache line.
    if (ctx.my_parent->my_state.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) {
        ctx.my_parent->my_state.store(d1::task_group_context::may_have_children, std::memory_order_relaxed); // full fence is below
    }
    if (ctx.my_parent->my_parent) {
        // Even if this context were made accessible for state change propagation
        // (by placing store_with_release(td->my_context_list_state.head.my_next, &ctx.my_node)
        // above), it still could be missed if state propagation from a grand-ancestor
        // was underway concurrently with binding.
        // Speculative propagation from the parent, together with epoch counters that
        // detect the possibility of such a race, allows us to avoid taking locks when
        // there is no contention.

        // An acquire fence is necessary to prevent reordering of the subsequent speculative
        // loads of the parent state data out of the scope where the epoch counter comparison
        // can reliably validate them.
        uintptr_t local_count_snapshot = ctx.my_parent->my_owner.load(std::memory_order_relaxed)->my_context_list_state.epoch.load(std::memory_order_acquire);
        // Speculative propagation of the parent's state. The speculation will be
        // validated by the epoch counter check further on.
        ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
        register_with(ctx, td); // Issues full fence

        // If no state propagation was detected by the following condition, the above
        // full fence guarantees that the parent had the correct state during speculative
        // propagation before the fence. Otherwise the propagation from the parent is
        // repeated under the lock.
        if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) {
            // Another thread may be propagating a state change right now. So resort to the lock.
            context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
            ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
        }
    } else {
        register_with(ctx, td); // Issues full fence
        // As we do not have grand-ancestors, concurrent state propagation (if any)
        // may originate only from the parent context, and thus it is safe to directly
        // copy the state from it.
        ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
    }

    ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::bound, std::memory_order_release);
}

void task_group_context_impl::bind_to(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    d1::task_group_context::lifetime_state state = ctx.my_lifetime_state.load(std::memory_order_acquire);
    if (state <= d1::task_group_context::lifetime_state::locked) {
        if (state == d1::task_group_context::lifetime_state::created &&
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
            ((std::atomic<typename std::underlying_type<d1::task_group_context::lifetime_state>::type>&)ctx.my_lifetime_state).compare_exchange_strong(
                (typename std::underlying_type<d1::task_group_context::lifetime_state>::type&)state,
                (typename std::underlying_type<d1::task_group_context::lifetime_state>::type)d1::task_group_context::lifetime_state::locked)
#else
            ctx.my_lifetime_state.compare_exchange_strong(state, d1::task_group_context::lifetime_state::locked)
#endif
            ) {
            // If we are in the outermost task dispatch loop of an external thread, then
            // there is nothing to bind this context to, and we skip the binding part
            // treating the context as isolated.
            __TBB_ASSERT(td->my_task_dispatcher->m_execute_data_ext.context != nullptr, nullptr);
            if (td->my_task_dispatcher->m_execute_data_ext.context == td->my_arena->my_default_ctx || !ctx.my_traits.bound) {
                if (!ctx.my_traits.fp_settings) {
                    copy_fp_settings(ctx, *td->my_arena->my_default_ctx);
                }
                ctx.my_lifetime_state.store(d1::task_group_context::lifetime_state::isolated, std::memory_order_release);
            } else {
                bind_to_impl(ctx, td);
            }
            ITT_STACK_CREATE(ctx.my_itt_caller);
        }
        spin_wait_while_eq(ctx.my_lifetime_state, d1::task_group_context::lifetime_state::locked);
    }
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) != d1::task_group_context::lifetime_state::created, NULL);
    __TBB_ASSERT(ctx.my_lifetime_state.load(std::memory_order_relaxed) != d1::task_group_context::lifetime_state::locked, NULL);
}

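/*
    Illustrative sketch (not compiled as part of this file): what the lazy binding above means
    at the public API level. A context constructed as `bound` (the default) is attached on
    first use to the innermost context of the thread that runs work under it, so cancelling
    the outer context also reaches the nested one; a context constructed as `isolated` stays
    detached. The parallel_for overloads taking a task_group_context are part of the public
    oneTBB API; process() and should_stop() are placeholders for user code.

        #include <oneapi/tbb/parallel_for.h>
        #include <oneapi/tbb/task_group.h>

        void binding_example() {
            tbb::task_group_context outer;                                      // bound by default
            tbb::parallel_for(0, 1000, [&](int i) {
                // Created while running under `outer`, so it binds to `outer` on first use.
                tbb::task_group_context nested(tbb::task_group_context::bound);
                tbb::parallel_for(0, 100, [&](int j) { process(i, j); }, nested);
                if (should_stop(i))
                    outer.cancel_group_execution();  // propagates into `nested` as well
            }, outer);
        }
*/
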
template <typename T>
void task_group_context_impl::propagate_task_group_state(d1::task_group_context& ctx, std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    if ((ctx.*mptr_state).load(std::memory_order_relaxed) == new_state) {
        // Nothing to do, whether descending from "src" or not, so no need to scan.
        // Hopefully this happens often thanks to earlier invocations.
        // This optimization is enabled by LIFO order in the context lists:
        // - new contexts are bound to the beginning of lists;
        // - descendants are newer than ancestors;
        // - earlier invocations are therefore likely to "paint" long chains.
    } else if (&ctx == &src) {
        // This clause is disjoint from the traversal below, which skips src entirely.
        // Note that src.*mptr_state is not necessarily still equal to new_state (another thread may have changed it again).
        // Such interference is probably not frequent enough to aim for optimization by writing new_state again (to make the other thread back down).
        // Letting the other thread prevail may also be fairer.
    } else {
        for (d1::task_group_context* ancestor = ctx.my_parent; ancestor != NULL; ancestor = ancestor->my_parent) {
            if (ancestor == &src) {
                for (d1::task_group_context* c = &ctx; c != ancestor; c = c->my_parent)
                    (c->*mptr_state).store(new_state, std::memory_order_relaxed);
                break;
            }
        }
    }
}

template <typename T>
void thread_data::propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    spin_mutex::scoped_lock lock(my_context_list_state.mutex);
    // An acquire fence is necessary to ensure that the subsequent node->my_next load
    // returns the correct value in case the node was just inserted by another thread.
    // The fence also ensures visibility of the correct ctx.my_parent value.
    d1::context_list_node* node = my_context_list_state.head.next.load(std::memory_order_acquire);
    while (node != &my_context_list_state.head) {
        d1::task_group_context& ctx = __TBB_get_object_ref(d1::task_group_context, my_node, node);
        if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state)
            task_group_context_impl::propagate_task_group_state(ctx, mptr_state, src, new_state);
        node = node->next.load(std::memory_order_relaxed);
    }
    // Sync up the local propagation epoch with the global one. The release fence prevents
    // reordering of a possible store to *mptr_state after the sync point.
    my_context_list_state.epoch.store(the_context_state_propagation_epoch.load(std::memory_order_relaxed), std::memory_order_release);
}

template <typename T>
bool market::propagate_task_group_state(std::atomic<T> d1::task_group_context::* mptr_state, d1::task_group_context& src, T new_state) {
    if (src.my_state.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children)
        return true;
    // The whole propagation algorithm is under the lock in order to ensure correctness
    // in case of concurrent state changes at the different levels of the context tree.
    // See the comment at the bottom of this file.
    context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
    if ((src.*mptr_state).load(std::memory_order_relaxed) != new_state)
        // Another thread has concurrently changed the state. Back down.
        return false;
    // Advance global state propagation epoch
    ++the_context_state_propagation_epoch;
    // Propagate to all workers and external threads and sync up their local epochs with the global one
    unsigned num_workers = my_first_unused_worker_idx;
    for (unsigned i = 0; i < num_workers; ++i) {
        thread_data* td = my_workers[i];
        // If the worker is only about to be registered, skip it.
        if (td)
            td->propagate_task_group_state(mptr_state, src, new_state);
    }
    // Propagate to all external threads
    // The whole propagation sequence is locked, thus no contention is expected
    for (thread_data_list_type::iterator it = my_masters.begin(); it != my_masters.end(); it++)
        it->propagate_task_group_state(mptr_state, src, new_state);
    return true;
}

bool task_group_context_impl::cancel_group_execution(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(ctx.my_cancellation_requested.load(std::memory_order_relaxed) <= 1, "The cancellation state can be either 0 or 1");
    if (ctx.my_cancellation_requested.load(std::memory_order_relaxed) || ctx.my_cancellation_requested.exchange(1)) {
        // This task group and any descendants have already been canceled.
        // (A newly added descendant would inherit its parent's ctx.my_cancellation_requested,
        // not missing out on any cancellation still being propagated, and a context cannot be uncanceled.)
        return false;
    }
    governor::get_thread_data()->my_arena->my_market->propagate_task_group_state(&d1::task_group_context::my_cancellation_requested, ctx, uint32_t(1));
    return true;
}

bool task_group_context_impl::is_group_execution_cancelled(const d1::task_group_context& ctx) {
    return ctx.my_cancellation_requested.load(std::memory_order_relaxed) != 0;
}

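/*
    Illustrative sketch (not compiled as part of this file): the public API calls that end up
    in cancel_group_execution() and is_group_execution_cancelled() above. Cancellation is
    cooperative: bodies that are already running are not interrupted, so long-running bodies
    may poll the context. expensive_step() is a placeholder for user code.

        #include <oneapi/tbb/parallel_for.h>
        #include <oneapi/tbb/task_group.h>

        void cancellation_example() {
            tbb::task_group_context ctx;
            tbb::parallel_for(0, 1000000, [&](int i) {
                if (ctx.is_group_execution_cancelled())
                    return;                          // cooperative early exit
                if (!expensive_step(i))
                    ctx.cancel_group_execution();    // cancels ctx and its bound descendants
            }, ctx);
            ctx.reset();                             // makes the context reusable after cancellation
        }
*/
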
// IMPORTANT: It is assumed that this method is not used concurrently!
void task_group_context_impl::reset(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    //! TODO: Add assertion that this context does not have children
    // No fences are necessary since this context can be accessed from another thread
    // only after stealing happened (which means necessary fences were used).
    if (ctx.my_exception) {
        ctx.my_exception->destroy();
        ctx.my_exception = NULL;
    }
    ctx.my_cancellation_requested = 0;
}

// IMPORTANT: It is assumed that this method is not used concurrently!
void task_group_context_impl::capture_fp_settings(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    //! TODO: Add assertion that this context does not have children
    // No fences are necessary since this context can be accessed from another thread
    // only after stealing happened (which means necessary fences were used).
    d1::cpu_ctl_env* ctl = reinterpret_cast<d1::cpu_ctl_env*>(&ctx.my_cpu_ctl_env);
    if (!ctx.my_traits.fp_settings) {
        ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
        ctx.my_traits.fp_settings = true;
    }
    ctl->get_env();
}

void task_group_context_impl::copy_fp_settings(d1::task_group_context& ctx, const d1::task_group_context& src) {
    __TBB_ASSERT(!is_poisoned(ctx.my_owner), NULL);
    __TBB_ASSERT(!ctx.my_traits.fp_settings, "The context already has FPU settings.");
    __TBB_ASSERT(src.my_traits.fp_settings, "The source context does not have FPU settings.");

    const d1::cpu_ctl_env* src_ctl = reinterpret_cast<const d1::cpu_ctl_env*>(&src.my_cpu_ctl_env);
    new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env(*src_ctl);
    ctx.my_traits.fp_settings = true;
}

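/*
    Illustrative sketch (not compiled as part of this file): how user code reaches
    capture_fp_settings() and copy_fp_settings() above. A context that has captured the FPU
    control state applies it to tasks executed under that context; contexts without the
    fp_settings trait inherit the settings from their parent (see bind_to_impl() above).
    The rounding mode is used only as an example of state worth capturing.

        #include <cfenv>
        #include <oneapi/tbb/parallel_for.h>
        #include <oneapi/tbb/task_group.h>

        void fp_settings_example() {
            std::fesetround(FE_TOWARDZERO);   // set the desired FPU state on this thread
            tbb::task_group_context ctx;
            ctx.capture_fp_settings();        // snapshot the calling thread's FPU control state
            tbb::parallel_for(0, 1000, [](int) {
                // Worker threads executing this body observe the captured rounding mode.
            }, ctx);
        }
*/
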
/*
    Comments:

    1.  The premise of the cancellation support implementation is that cancellations are
        not part of the hot path of the program execution. Therefore all changes in its
        implementation in order to reduce the overhead of the cancellation control flow
        should be done only in ways that do not increase overhead of the normal execution.

        In general, contexts are used by all threads and their descendants are created in
        different threads as well. In order to minimize the impact of the cross-thread tree
        maintenance (first of all because of the synchronization), the tree of contexts
        is split into pieces, each of which is handled by a single thread. Such pieces
        are represented as lists of contexts, members of which are contexts that were
        bound to their parents in the given thread.

        The context tree maintenance and cancellation propagation algorithms are designed
        in such a manner that cross-thread access to a context list will take place only
        when a cancellation signal is sent (by the user or when an exception happens), and
        synchronization is necessary only then. Thus the normal execution flow (without
        exceptions and cancellation) remains free from any synchronization done on
        behalf of exception handling and cancellation support.

    2.  Consider parallel cancellations at the different levels of the context tree:

        Ctx1 <- Cancelled by Thread1            |- Thread2 started processing
         |                                      |
        Ctx2                                    |- Thread1 started processing
         |                                   T1 |- Thread2 finishes and syncs up local counters
        Ctx3 <- Cancelled by Thread2            |
         |                                      |- Ctx5 is bound to Ctx2
        Ctx4                                    |
                                             T2 |- Thread1 reaches Ctx2

        The thread propagating each cancellation increments the global counter. However, the
        thread propagating the cancellation from the outermost context (Thread1) may be the
        last to finish, which means that the local counters may be synchronized earlier
        (by Thread2, at time T1) than the cancellation is propagated into Ctx2 (at time T2).
        If a new context (Ctx5) is created and bound to Ctx2 between T1 and T2, checking only
        its parent (Ctx2) may result in the cancellation request being lost.

        This issue is solved by doing the whole propagation under the lock.

        If we need more concurrency while processing parallel cancellations, we could try
        the following modification of the propagation algorithm:

        advance global counter and remember it
        for each thread:
            scan thread's list of contexts
        for each thread:
            sync up its local counter only if the global counter has not been changed

        However, this version of the algorithm requires more analysis and verification.
*/

void __TBB_EXPORTED_FUNC initialize(d1::task_group_context& ctx) {
    task_group_context_impl::initialize(ctx);
}
void __TBB_EXPORTED_FUNC destroy(d1::task_group_context& ctx) {
    task_group_context_impl::destroy(ctx);
}
void __TBB_EXPORTED_FUNC reset(d1::task_group_context& ctx) {
    task_group_context_impl::reset(ctx);
}
bool __TBB_EXPORTED_FUNC cancel_group_execution(d1::task_group_context& ctx) {
    return task_group_context_impl::cancel_group_execution(ctx);
}
bool __TBB_EXPORTED_FUNC is_group_execution_cancelled(d1::task_group_context& ctx) {
    return task_group_context_impl::is_group_execution_cancelled(ctx);
}
void __TBB_EXPORTED_FUNC capture_fp_settings(d1::task_group_context& ctx) {
    task_group_context_impl::capture_fp_settings(ctx);
}

} // namespace r1
} // namespace detail
} // namespace tbb