1 /* 2 Copyright (c) 2005-2022 Intel Corporation 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 #ifndef __TBB_parallel_for_H 18 #define __TBB_parallel_for_H 19 20 #include "detail/_config.h" 21 #include "detail/_namespace_injection.h" 22 #include "detail/_exception.h" 23 #include "detail/_task.h" 24 #include "detail/_small_object_pool.h" 25 #include "profiling.h" 26 27 #include "partitioner.h" 28 #include "blocked_range.h" 29 #include "task_group.h" 30 31 #include <cstddef> 32 #include <new> 33 34 namespace tbb { 35 namespace detail { 36 #if __TBB_CPP20_CONCEPTS_PRESENT 37 inline namespace d0 { 38 39 template <typename Body, typename Range> 40 concept parallel_for_body = std::copy_constructible<Body> && 41 requires( const std::remove_reference_t<Body>& body, Range& range ) { 42 body(range); 43 }; 44 45 template <typename Index> 46 concept parallel_for_index = std::constructible_from<Index, int> && 47 std::copyable<Index> && 48 requires( const std::remove_reference_t<Index>& lhs, const std::remove_reference_t<Index>& rhs ) { 49 { lhs < rhs } -> adaptive_same_as<bool>; 50 { lhs - rhs } -> std::convertible_to<std::size_t>; 51 { lhs + (rhs - lhs) } -> std::convertible_to<Index>; 52 }; 53 54 template <typename Function, typename Index> 55 concept parallel_for_function = requires( const std::remove_reference_t<Function>& func, Index index ) { 56 func(index); 57 }; 58 59 } // namespace d0 60 #endif // __TBB_CPP20_CONCEPTS_PRESENT 61 namespace d1 { 62 63 //! Task type used in parallel_for 64 /** @ingroup algorithms */ 65 template<typename Range, typename Body, typename Partitioner> 66 struct start_for : public task { 67 Range my_range; 68 const Body my_body; 69 node* my_parent; 70 71 typename Partitioner::task_partition_type my_partition; 72 small_object_allocator my_allocator; 73 74 task* execute(execution_data&) override; 75 task* cancel(execution_data&) override; 76 void finalize(const execution_data&); 77 78 //! Constructor for root task. 79 start_for( const Range& range, const Body& body, Partitioner& partitioner, small_object_allocator& alloc ) : 80 my_range(range), 81 my_body(body), 82 my_parent(nullptr), 83 my_partition(partitioner), 84 my_allocator(alloc) {} 85 //! Splitting constructor used to generate children. 86 /** parent_ becomes left child. Newly constructed object is right child. */ 87 start_for( start_for& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) : 88 my_range(parent_.my_range, get_range_split_object<Range>(split_obj)), 89 my_body(parent_.my_body), 90 my_parent(nullptr), 91 my_partition(parent_.my_partition, split_obj), 92 my_allocator(alloc) {} 93 //! Construct right child from the given range as response to the demand. 94 /** parent_ remains left child. Newly constructed object is right child. */ 95 start_for( start_for& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) : 96 my_range(r), 97 my_body(parent_.my_body), 98 my_parent(nullptr), 99 my_partition(parent_.my_partition, split()), 100 my_allocator(alloc) 101 { 102 my_partition.align_depth( d ); 103 } 104 static void run(const Range& range, const Body& body, Partitioner& partitioner) { 105 task_group_context context(PARALLEL_FOR); 106 run(range, body, partitioner, context); 107 } 108 109 static void run(const Range& range, const Body& body, Partitioner& partitioner, task_group_context& context) { 110 if ( !range.empty() ) { 111 small_object_allocator alloc{}; 112 start_for& for_task = *alloc.new_object<start_for>(range, body, partitioner, alloc); 113 114 // defer creation of the wait node until task allocation succeeds 115 wait_node wn; 116 for_task.my_parent = &wn; 117 execute_and_wait(for_task, context, wn.m_wait, context); 118 } 119 } 120 //! Run body for range, serves as callback for partitioner 121 void run_body( Range &r ) { 122 my_body( r ); 123 } 124 125 //! spawn right task, serves as callback for partitioner 126 void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) { 127 offer_work_impl(ed, *this, split_obj); 128 } 129 130 //! spawn right task, serves as callback for partitioner 131 void offer_work(const Range& r, depth_t d, execution_data& ed) { 132 offer_work_impl(ed, *this, r, d); 133 } 134 135 private: 136 template <typename... Args> 137 void offer_work_impl(execution_data& ed, Args&&... constructor_args) { 138 // New right child 139 small_object_allocator alloc{}; 140 start_for& right_child = *alloc.new_object<start_for>(ed, std::forward<Args>(constructor_args)..., alloc); 141 142 // New root node as a continuation and ref count. Left and right child attach to the new parent. 143 right_child.my_parent = my_parent = alloc.new_object<tree_node>(ed, my_parent, 2, alloc); 144 // Spawn the right sibling 145 right_child.spawn_self(ed); 146 } 147 148 void spawn_self(execution_data& ed) { 149 my_partition.spawn_task(*this, *context(ed)); 150 } 151 }; 152 153 //! fold the tree and deallocate the task 154 template<typename Range, typename Body, typename Partitioner> 155 void start_for<Range, Body, Partitioner>::finalize(const execution_data& ed) { 156 // Get the current parent and allocator an object destruction 157 node* parent = my_parent; 158 auto allocator = my_allocator; 159 // Task execution finished - destroy it 160 this->~start_for(); 161 // Unwind the tree decrementing the parent`s reference count 162 163 fold_tree<tree_node>(parent, ed); 164 allocator.deallocate(this, ed); 165 166 } 167 168 //! execute task for parallel_for 169 template<typename Range, typename Body, typename Partitioner> 170 task* start_for<Range, Body, Partitioner>::execute(execution_data& ed) { 171 if (!is_same_affinity(ed)) { 172 my_partition.note_affinity(execution_slot(ed)); 173 } 174 my_partition.check_being_stolen(*this, ed); 175 my_partition.execute(*this, my_range, ed); 176 finalize(ed); 177 return nullptr; 178 } 179 180 //! cancel task for parallel_for 181 template<typename Range, typename Body, typename Partitioner> 182 task* start_for<Range, Body, Partitioner>::cancel(execution_data& ed) { 183 finalize(ed); 184 return nullptr; 185 } 186 187 //! Calls the function with values from range [begin, end) with a step provided 188 template<typename Function, typename Index> 189 class parallel_for_body_wrapper : detail::no_assign { 190 const Function &my_func; 191 const Index my_begin; 192 const Index my_step; 193 public: 194 parallel_for_body_wrapper( const Function& _func, Index& _begin, Index& _step ) 195 : my_func(_func), my_begin(_begin), my_step(_step) {} 196 197 void operator()( const blocked_range<Index>& r ) const { 198 // A set of local variables to help the compiler with vectorization of the following loop. 199 Index b = r.begin(); 200 Index e = r.end(); 201 Index ms = my_step; 202 Index k = my_begin + b*ms; 203 204 #if __INTEL_COMPILER 205 #pragma ivdep 206 #if __TBB_ASSERT_ON_VECTORIZATION_FAILURE 207 #pragma vector always assert 208 #endif 209 #endif 210 for ( Index i = b; i < e; ++i, k += ms ) { 211 my_func( k ); 212 } 213 } 214 }; 215 216 // Requirements on Range concept are documented in blocked_range.h 217 218 /** \page parallel_for_body_req Requirements on parallel_for body 219 Class \c Body implementing the concept of parallel_for body must define: 220 - \code Body::Body( const Body& ); \endcode Copy constructor 221 - \code Body::~Body(); \endcode Destructor 222 - \code void Body::operator()( Range& r ) const; \endcode Function call operator applying the body to range \c r. 223 **/ 224 225 /** \name parallel_for 226 See also requirements on \ref range_req "Range" and \ref parallel_for_body_req "parallel_for Body". **/ 227 //@{ 228 229 //! Parallel iteration over range with default partitioner. 230 /** @ingroup algorithms **/ 231 template<typename Range, typename Body> 232 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 233 void parallel_for( const Range& range, const Body& body ) { 234 start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range,body,__TBB_DEFAULT_PARTITIONER()); 235 } 236 237 //! Parallel iteration over range with simple partitioner. 238 /** @ingroup algorithms **/ 239 template<typename Range, typename Body> 240 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 241 void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner ) { 242 start_for<Range,Body,const simple_partitioner>::run(range,body,partitioner); 243 } 244 245 //! Parallel iteration over range with auto_partitioner. 246 /** @ingroup algorithms **/ 247 template<typename Range, typename Body> 248 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 249 void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner ) { 250 start_for<Range,Body,const auto_partitioner>::run(range,body,partitioner); 251 } 252 253 //! Parallel iteration over range with static_partitioner. 254 /** @ingroup algorithms **/ 255 template<typename Range, typename Body> 256 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 257 void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner ) { 258 start_for<Range,Body,const static_partitioner>::run(range,body,partitioner); 259 } 260 261 //! Parallel iteration over range with affinity_partitioner. 262 /** @ingroup algorithms **/ 263 template<typename Range, typename Body> 264 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 265 void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner ) { 266 start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner); 267 } 268 269 //! Parallel iteration over range with default partitioner and user-supplied context. 270 /** @ingroup algorithms **/ 271 template<typename Range, typename Body> 272 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 273 void parallel_for( const Range& range, const Body& body, task_group_context& context ) { 274 start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range, body, __TBB_DEFAULT_PARTITIONER(), context); 275 } 276 277 //! Parallel iteration over range with simple partitioner and user-supplied context. 278 /** @ingroup algorithms **/ 279 template<typename Range, typename Body> 280 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 281 void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner, task_group_context& context ) { 282 start_for<Range,Body,const simple_partitioner>::run(range, body, partitioner, context); 283 } 284 285 //! Parallel iteration over range with auto_partitioner and user-supplied context. 286 /** @ingroup algorithms **/ 287 template<typename Range, typename Body> 288 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 289 void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner, task_group_context& context ) { 290 start_for<Range,Body,const auto_partitioner>::run(range, body, partitioner, context); 291 } 292 293 //! Parallel iteration over range with static_partitioner and user-supplied context. 294 /** @ingroup algorithms **/ 295 template<typename Range, typename Body> 296 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 297 void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner, task_group_context& context ) { 298 start_for<Range,Body,const static_partitioner>::run(range, body, partitioner, context); 299 } 300 301 //! Parallel iteration over range with affinity_partitioner and user-supplied context. 302 /** @ingroup algorithms **/ 303 template<typename Range, typename Body> 304 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 305 void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner, task_group_context& context ) { 306 start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner, context); 307 } 308 309 //! Implementation of parallel iteration over stepped range of integers with explicit step and partitioner 310 template <typename Index, typename Function, typename Partitioner> 311 void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner) { 312 if (step <= 0 ) 313 throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument 314 else if (first < last) { 315 // Above "else" avoids "potential divide by zero" warning on some platforms 316 Index end = (last - first - Index(1)) / step + Index(1); 317 blocked_range<Index> range(static_cast<Index>(0), end); 318 parallel_for_body_wrapper<Function, Index> body(f, first, step); 319 parallel_for(range, body, partitioner); 320 } 321 } 322 323 //! Parallel iteration over a range of integers with a step provided and default partitioner 324 template <typename Index, typename Function> 325 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 326 void parallel_for(Index first, Index last, Index step, const Function& f) { 327 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner()); 328 } 329 //! Parallel iteration over a range of integers with a step provided and simple partitioner 330 template <typename Index, typename Function> 331 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 332 void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner) { 333 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner); 334 } 335 //! Parallel iteration over a range of integers with a step provided and auto partitioner 336 template <typename Index, typename Function> 337 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 338 void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner) { 339 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner); 340 } 341 //! Parallel iteration over a range of integers with a step provided and static partitioner 342 template <typename Index, typename Function> 343 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 344 void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner) { 345 parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner); 346 } 347 //! Parallel iteration over a range of integers with a step provided and affinity partitioner 348 template <typename Index, typename Function> 349 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 350 void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner) { 351 parallel_for_impl(first, last, step, f, partitioner); 352 } 353 354 //! Parallel iteration over a range of integers with a default step value and default partitioner 355 template <typename Index, typename Function> 356 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 357 void parallel_for(Index first, Index last, const Function& f) { 358 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner()); 359 } 360 //! Parallel iteration over a range of integers with a default step value and simple partitioner 361 template <typename Index, typename Function> 362 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 363 void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner) { 364 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner); 365 } 366 //! Parallel iteration over a range of integers with a default step value and auto partitioner 367 template <typename Index, typename Function> 368 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 369 void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner) { 370 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner); 371 } 372 //! Parallel iteration over a range of integers with a default step value and static partitioner 373 template <typename Index, typename Function> 374 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 375 void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner) { 376 parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner); 377 } 378 //! Parallel iteration over a range of integers with a default step value and affinity partitioner 379 template <typename Index, typename Function> 380 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 381 void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner) { 382 parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner); 383 } 384 385 //! Implementation of parallel iteration over stepped range of integers with explicit step, task group context, and partitioner 386 template <typename Index, typename Function, typename Partitioner> 387 void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner, task_group_context &context) { 388 if (step <= 0 ) 389 throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument 390 else if (first < last) { 391 // Above "else" avoids "potential divide by zero" warning on some platforms 392 Index end = (last - first - Index(1)) / step + Index(1); 393 blocked_range<Index> range(static_cast<Index>(0), end); 394 parallel_for_body_wrapper<Function, Index> body(f, first, step); 395 parallel_for(range, body, partitioner, context); 396 } 397 } 398 399 //! Parallel iteration over a range of integers with explicit step, task group context, and default partitioner 400 template <typename Index, typename Function> 401 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 402 void parallel_for(Index first, Index last, Index step, const Function& f, task_group_context &context) { 403 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner(), context); 404 } 405 //! Parallel iteration over a range of integers with explicit step, task group context, and simple partitioner 406 template <typename Index, typename Function> 407 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 408 void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner, task_group_context &context) { 409 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner, context); 410 } 411 //! Parallel iteration over a range of integers with explicit step, task group context, and auto partitioner 412 template <typename Index, typename Function> 413 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 414 void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner, task_group_context &context) { 415 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner, context); 416 } 417 //! Parallel iteration over a range of integers with explicit step, task group context, and static partitioner 418 template <typename Index, typename Function> 419 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 420 void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner, task_group_context &context) { 421 parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner, context); 422 } 423 //! Parallel iteration over a range of integers with explicit step, task group context, and affinity partitioner 424 template <typename Index, typename Function> 425 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 426 void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner, task_group_context &context) { 427 parallel_for_impl(first, last, step, f, partitioner, context); 428 } 429 430 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and default partitioner 431 template <typename Index, typename Function> 432 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 433 void parallel_for(Index first, Index last, const Function& f, task_group_context &context) { 434 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner(), context); 435 } 436 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and simple partitioner 437 template <typename Index, typename Function> 438 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 439 void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner, task_group_context &context) { 440 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context); 441 } 442 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and auto partitioner 443 template <typename Index, typename Function> 444 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 445 void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner, task_group_context &context) { 446 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context); 447 } 448 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and static partitioner 449 template <typename Index, typename Function> 450 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 451 void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner, task_group_context &context) { 452 parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context); 453 } 454 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and affinity_partitioner 455 template <typename Index, typename Function> 456 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 457 void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner, task_group_context &context) { 458 parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner, context); 459 } 460 // @} 461 462 } // namespace d1 463 } // namespace detail 464 465 inline namespace v1 { 466 using detail::d1::parallel_for; 467 // Split types 468 using detail::split; 469 using detail::proportional_split; 470 } // namespace v1 471 472 } // namespace tbb 473 474 #endif /* __TBB_parallel_for_H */ 475