1 /* 2 Copyright (c) 2005-2021 Intel Corporation 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 #ifndef __TBB_parallel_for_H 18 #define __TBB_parallel_for_H 19 20 #include "detail/_config.h" 21 #include "detail/_namespace_injection.h" 22 #include "detail/_exception.h" 23 #include "detail/_task.h" 24 #include "detail/_small_object_pool.h" 25 #include "profiling.h" 26 27 #include "partitioner.h" 28 #include "blocked_range.h" 29 #include "task_group.h" 30 31 #include <cstddef> 32 #include <new> 33 34 namespace tbb { 35 namespace detail { 36 #if __TBB_CPP20_CONCEPTS_PRESENT 37 inline namespace d0 { 38 39 template <typename Body, typename Range> 40 concept parallel_for_body = std::copy_constructible<Body> && 41 requires( const std::remove_reference_t<Body>& body, Range& range ) { 42 body(range); 43 }; 44 45 template <typename Index> 46 concept parallel_for_index = std::constructible_from<Index, int> && 47 std::copyable<Index> && 48 requires( const std::remove_reference_t<Index>& lhs, const std::remove_reference_t<Index>& rhs ) { 49 { lhs < rhs } -> adaptive_same_as<bool>; 50 { lhs - rhs } -> std::convertible_to<std::size_t>; 51 { lhs + (rhs - lhs) } -> std::convertible_to<Index>; 52 }; 53 54 template <typename Function, typename Index> 55 concept parallel_for_function = requires( const std::remove_reference_t<Function>& func, Index index ) { 56 func(index); 57 }; 58 59 } // namespace d0 60 #endif // __TBB_CPP20_CONCEPTS_PRESENT 61 namespace d1 { 62 63 //! Task type used in parallel_for 64 /** @ingroup algorithms */ 65 template<typename Range, typename Body, typename Partitioner> 66 struct start_for : public task { 67 Range my_range; 68 const Body my_body; 69 node* my_parent; 70 71 typename Partitioner::task_partition_type my_partition; 72 small_object_allocator my_allocator; 73 74 task* execute(execution_data&) override; 75 task* cancel(execution_data&) override; 76 void finalize(const execution_data&); 77 78 //! Constructor for root task. 79 start_for( const Range& range, const Body& body, Partitioner& partitioner, small_object_allocator& alloc ) : 80 my_range(range), 81 my_body(body), 82 my_partition(partitioner), 83 my_allocator(alloc) {} 84 //! Splitting constructor used to generate children. 85 /** parent_ becomes left child. Newly constructed object is right child. */ 86 start_for( start_for& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) : 87 my_range(parent_.my_range, get_range_split_object<Range>(split_obj)), 88 my_body(parent_.my_body), 89 my_partition(parent_.my_partition, split_obj), 90 my_allocator(alloc) {} 91 //! Construct right child from the given range as response to the demand. 92 /** parent_ remains left child. Newly constructed object is right child. */ 93 start_for( start_for& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) : 94 my_range(r), 95 my_body(parent_.my_body), 96 my_partition(parent_.my_partition, split()), 97 my_allocator(alloc) 98 { 99 my_partition.align_depth( d ); 100 } 101 static void run(const Range& range, const Body& body, Partitioner& partitioner) { 102 task_group_context context(PARALLEL_FOR); 103 run(range, body, partitioner, context); 104 } 105 106 static void run(const Range& range, const Body& body, Partitioner& partitioner, task_group_context& context) { 107 if ( !range.empty() ) { 108 small_object_allocator alloc{}; 109 start_for& for_task = *alloc.new_object<start_for>(range, body, partitioner, alloc); 110 111 // defer creation of the wait node until task allocation succeeds 112 wait_node wn; 113 for_task.my_parent = &wn; 114 execute_and_wait(for_task, context, wn.m_wait, context); 115 } 116 } 117 //! Run body for range, serves as callback for partitioner 118 void run_body( Range &r ) { 119 my_body( r ); 120 } 121 122 //! spawn right task, serves as callback for partitioner 123 void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) { 124 offer_work_impl(ed, *this, split_obj); 125 } 126 127 //! spawn right task, serves as callback for partitioner 128 void offer_work(const Range& r, depth_t d, execution_data& ed) { 129 offer_work_impl(ed, *this, r, d); 130 } 131 132 private: 133 template <typename... Args> 134 void offer_work_impl(execution_data& ed, Args&&... constructor_args) { 135 // New right child 136 small_object_allocator alloc{}; 137 start_for& right_child = *alloc.new_object<start_for>(ed, std::forward<Args>(constructor_args)..., alloc); 138 139 // New root node as a continuation and ref count. Left and right child attach to the new parent. 140 right_child.my_parent = my_parent = alloc.new_object<tree_node>(ed, my_parent, 2, alloc); 141 // Spawn the right sibling 142 right_child.spawn_self(ed); 143 } 144 145 void spawn_self(execution_data& ed) { 146 my_partition.spawn_task(*this, *context(ed)); 147 } 148 }; 149 150 //! fold the tree and deallocate the task 151 template<typename Range, typename Body, typename Partitioner> 152 void start_for<Range, Body, Partitioner>::finalize(const execution_data& ed) { 153 // Get the current parent and allocator an object destruction 154 node* parent = my_parent; 155 auto allocator = my_allocator; 156 // Task execution finished - destroy it 157 this->~start_for(); 158 // Unwind the tree decrementing the parent`s reference count 159 160 fold_tree<tree_node>(parent, ed); 161 allocator.deallocate(this, ed); 162 163 } 164 165 //! execute task for parallel_for 166 template<typename Range, typename Body, typename Partitioner> 167 task* start_for<Range, Body, Partitioner>::execute(execution_data& ed) { 168 if (!is_same_affinity(ed)) { 169 my_partition.note_affinity(execution_slot(ed)); 170 } 171 my_partition.check_being_stolen(*this, ed); 172 my_partition.execute(*this, my_range, ed); 173 finalize(ed); 174 return nullptr; 175 } 176 177 //! cancel task for parallel_for 178 template<typename Range, typename Body, typename Partitioner> 179 task* start_for<Range, Body, Partitioner>::cancel(execution_data& ed) { 180 finalize(ed); 181 return nullptr; 182 } 183 184 //! Calls the function with values from range [begin, end) with a step provided 185 template<typename Function, typename Index> 186 class parallel_for_body_wrapper : detail::no_assign { 187 const Function &my_func; 188 const Index my_begin; 189 const Index my_step; 190 public: 191 parallel_for_body_wrapper( const Function& _func, Index& _begin, Index& _step ) 192 : my_func(_func), my_begin(_begin), my_step(_step) {} 193 194 void operator()( const blocked_range<Index>& r ) const { 195 // A set of local variables to help the compiler with vectorization of the following loop. 196 Index b = r.begin(); 197 Index e = r.end(); 198 Index ms = my_step; 199 Index k = my_begin + b*ms; 200 201 #if __INTEL_COMPILER 202 #pragma ivdep 203 #if __TBB_ASSERT_ON_VECTORIZATION_FAILURE 204 #pragma vector always assert 205 #endif 206 #endif 207 for ( Index i = b; i < e; ++i, k += ms ) { 208 my_func( k ); 209 } 210 } 211 }; 212 213 // Requirements on Range concept are documented in blocked_range.h 214 215 /** \page parallel_for_body_req Requirements on parallel_for body 216 Class \c Body implementing the concept of parallel_for body must define: 217 - \code Body::Body( const Body& ); \endcode Copy constructor 218 - \code Body::~Body(); \endcode Destructor 219 - \code void Body::operator()( Range& r ) const; \endcode Function call operator applying the body to range \c r. 220 **/ 221 222 /** \name parallel_for 223 See also requirements on \ref range_req "Range" and \ref parallel_for_body_req "parallel_for Body". **/ 224 //@{ 225 226 //! Parallel iteration over range with default partitioner. 227 /** @ingroup algorithms **/ 228 template<typename Range, typename Body> 229 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 230 void parallel_for( const Range& range, const Body& body ) { 231 start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range,body,__TBB_DEFAULT_PARTITIONER()); 232 } 233 234 //! Parallel iteration over range with simple partitioner. 235 /** @ingroup algorithms **/ 236 template<typename Range, typename Body> 237 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 238 void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner ) { 239 start_for<Range,Body,const simple_partitioner>::run(range,body,partitioner); 240 } 241 242 //! Parallel iteration over range with auto_partitioner. 243 /** @ingroup algorithms **/ 244 template<typename Range, typename Body> 245 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 246 void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner ) { 247 start_for<Range,Body,const auto_partitioner>::run(range,body,partitioner); 248 } 249 250 //! Parallel iteration over range with static_partitioner. 251 /** @ingroup algorithms **/ 252 template<typename Range, typename Body> 253 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 254 void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner ) { 255 start_for<Range,Body,const static_partitioner>::run(range,body,partitioner); 256 } 257 258 //! Parallel iteration over range with affinity_partitioner. 259 /** @ingroup algorithms **/ 260 template<typename Range, typename Body> 261 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 262 void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner ) { 263 start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner); 264 } 265 266 //! Parallel iteration over range with default partitioner and user-supplied context. 267 /** @ingroup algorithms **/ 268 template<typename Range, typename Body> 269 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 270 void parallel_for( const Range& range, const Body& body, task_group_context& context ) { 271 start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range, body, __TBB_DEFAULT_PARTITIONER(), context); 272 } 273 274 //! Parallel iteration over range with simple partitioner and user-supplied context. 275 /** @ingroup algorithms **/ 276 template<typename Range, typename Body> 277 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 278 void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner, task_group_context& context ) { 279 start_for<Range,Body,const simple_partitioner>::run(range, body, partitioner, context); 280 } 281 282 //! Parallel iteration over range with auto_partitioner and user-supplied context. 283 /** @ingroup algorithms **/ 284 template<typename Range, typename Body> 285 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 286 void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner, task_group_context& context ) { 287 start_for<Range,Body,const auto_partitioner>::run(range, body, partitioner, context); 288 } 289 290 //! Parallel iteration over range with static_partitioner and user-supplied context. 291 /** @ingroup algorithms **/ 292 template<typename Range, typename Body> 293 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 294 void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner, task_group_context& context ) { 295 start_for<Range,Body,const static_partitioner>::run(range, body, partitioner, context); 296 } 297 298 //! Parallel iteration over range with affinity_partitioner and user-supplied context. 299 /** @ingroup algorithms **/ 300 template<typename Range, typename Body> 301 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>) 302 void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner, task_group_context& context ) { 303 start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner, context); 304 } 305 306 //! Implementation of parallel iteration over stepped range of integers with explicit step and partitioner 307 template <typename Index, typename Function, typename Partitioner> 308 void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner) { 309 if (step <= 0 ) 310 throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument 311 else if (first < last) { 312 // Above "else" avoids "potential divide by zero" warning on some platforms 313 Index end = (last - first - Index(1)) / step + Index(1); 314 blocked_range<Index> range(static_cast<Index>(0), end); 315 parallel_for_body_wrapper<Function, Index> body(f, first, step); 316 parallel_for(range, body, partitioner); 317 } 318 } 319 320 //! Parallel iteration over a range of integers with a step provided and default partitioner 321 template <typename Index, typename Function> 322 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 323 void parallel_for(Index first, Index last, Index step, const Function& f) { 324 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner()); 325 } 326 //! Parallel iteration over a range of integers with a step provided and simple partitioner 327 template <typename Index, typename Function> 328 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 329 void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner) { 330 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner); 331 } 332 //! Parallel iteration over a range of integers with a step provided and auto partitioner 333 template <typename Index, typename Function> 334 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 335 void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner) { 336 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner); 337 } 338 //! Parallel iteration over a range of integers with a step provided and static partitioner 339 template <typename Index, typename Function> 340 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 341 void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner) { 342 parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner); 343 } 344 //! Parallel iteration over a range of integers with a step provided and affinity partitioner 345 template <typename Index, typename Function> 346 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 347 void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner) { 348 parallel_for_impl(first, last, step, f, partitioner); 349 } 350 351 //! Parallel iteration over a range of integers with a default step value and default partitioner 352 template <typename Index, typename Function> 353 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 354 void parallel_for(Index first, Index last, const Function& f) { 355 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner()); 356 } 357 //! Parallel iteration over a range of integers with a default step value and simple partitioner 358 template <typename Index, typename Function> 359 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 360 void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner) { 361 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner); 362 } 363 //! Parallel iteration over a range of integers with a default step value and auto partitioner 364 template <typename Index, typename Function> 365 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 366 void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner) { 367 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner); 368 } 369 //! Parallel iteration over a range of integers with a default step value and static partitioner 370 template <typename Index, typename Function> 371 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 372 void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner) { 373 parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner); 374 } 375 //! Parallel iteration over a range of integers with a default step value and affinity partitioner 376 template <typename Index, typename Function> 377 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 378 void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner) { 379 parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner); 380 } 381 382 //! Implementation of parallel iteration over stepped range of integers with explicit step, task group context, and partitioner 383 template <typename Index, typename Function, typename Partitioner> 384 void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner, task_group_context &context) { 385 if (step <= 0 ) 386 throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument 387 else if (first < last) { 388 // Above "else" avoids "potential divide by zero" warning on some platforms 389 Index end = (last - first - Index(1)) / step + Index(1); 390 blocked_range<Index> range(static_cast<Index>(0), end); 391 parallel_for_body_wrapper<Function, Index> body(f, first, step); 392 parallel_for(range, body, partitioner, context); 393 } 394 } 395 396 //! Parallel iteration over a range of integers with explicit step, task group context, and default partitioner 397 template <typename Index, typename Function> 398 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 399 void parallel_for(Index first, Index last, Index step, const Function& f, task_group_context &context) { 400 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner(), context); 401 } 402 //! Parallel iteration over a range of integers with explicit step, task group context, and simple partitioner 403 template <typename Index, typename Function> 404 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 405 void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner, task_group_context &context) { 406 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner, context); 407 } 408 //! Parallel iteration over a range of integers with explicit step, task group context, and auto partitioner 409 template <typename Index, typename Function> 410 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 411 void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner, task_group_context &context) { 412 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner, context); 413 } 414 //! Parallel iteration over a range of integers with explicit step, task group context, and static partitioner 415 template <typename Index, typename Function> 416 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 417 void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner, task_group_context &context) { 418 parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner, context); 419 } 420 //! Parallel iteration over a range of integers with explicit step, task group context, and affinity partitioner 421 template <typename Index, typename Function> 422 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 423 void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner, task_group_context &context) { 424 parallel_for_impl(first, last, step, f, partitioner, context); 425 } 426 427 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and default partitioner 428 template <typename Index, typename Function> 429 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 430 void parallel_for(Index first, Index last, const Function& f, task_group_context &context) { 431 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner(), context); 432 } 433 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and simple partitioner 434 template <typename Index, typename Function> 435 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 436 void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner, task_group_context &context) { 437 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context); 438 } 439 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and auto partitioner 440 template <typename Index, typename Function> 441 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 442 void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner, task_group_context &context) { 443 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context); 444 } 445 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and static partitioner 446 template <typename Index, typename Function> 447 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 448 void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner, task_group_context &context) { 449 parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context); 450 } 451 //! Parallel iteration over a range of integers with a default step value, explicit task group context, and affinity_partitioner 452 template <typename Index, typename Function> 453 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>) 454 void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner, task_group_context &context) { 455 parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner, context); 456 } 457 // @} 458 459 } // namespace d1 460 } // namespace detail 461 462 inline namespace v1 { 463 using detail::d1::parallel_for; 464 // Split types 465 using detail::split; 466 using detail::proportional_split; 467 } // namespace v1 468 469 } // namespace tbb 470 471 #endif /* __TBB_parallel_for_H */ 472