//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//      __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//    The reason we don't exec end_parallel for the threads not included in
//    the parallel loop is that for each barrier in the parallel region, these
//    non-included threads will cycle through the syncthreads at A. Thus they
//    must preserve their current threadId, which is larger than the number of
//    threads in the team.
//
//    To make a long story short...
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

#pragma omp begin declare target device_type(nohost)

namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getBlockSize();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

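  // Illustrative values, assuming a warp size of 32: a request for 45 threads
  // is rounded down to 32, a request for 64 stays 64, and a request for 20,
  // being smaller than a full warp, yields a team of a single thread.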
  // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}

// Invoke an outlined parallel function, unwrapping up to 32 arguments.
void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
                     void **args, int64_t nargs) {
  DebugEntryRAII Entry(__FILE__, __LINE__, "<OpenMP Outlined Function>");
  switch (nargs) {
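  // The generated header below is expected to expand to one case per argument
  // count, along the lines of (an illustrative sketch, not the verbatim
  // generated code):
  //
  //   case 2:
  //     ((void (*)(int32_t *, int32_t *, void *, void *))fn)(
  //         &global_tid, &bound_tid, args[0], args[1]);
  //     break;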
#include "generated_microtask_cases.gen"
  default:
    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

} // namespace

extern "C" {

void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                        int32_t num_threads, int proc_bind, void *fn,
                        void *wrapper_fn, void **args, int64_t nargs) {
  FunctionTracingRAII();

  uint32_t TId = mapping::getThreadIdInBlock();
  // Handle the serialized case first, same for SPMD/non-SPMD.
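  // (This path is taken, e.g., for `#pragma omp parallel if(0)` or for a
  // parallel region nested inside an already active one.)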
  if (OMP_UNLIKELY(!if_expr || icv::Level)) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    state::exitDataEnvironment();
    return;
  }

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  if (mapping::isSPMDMode()) {
    // Avoid the race between the read of the `icv::Level` above and the write
    // below by synchronizing all threads here.
    synchronize::threadsAligned();
    {
      // Note that the order here is important. `icv::Level` has to be updated
      // last or the other updates will cause a thread-specific state to be
      // created.
      state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                            1u, TId == 0, ident);
      state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0,
                                       ident);
      state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident);
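      // (ValueRAII, defined in State.h, installs the new value, guarded here
      // by `TId == 0` so that only the main thread writes the shared team
      // state, and restores the old value when the scope ends.)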

      // Synchronize all threads after the main thread (TId == 0) has set up
      // the team state properly.
      synchronize::threadsAligned();

      ASSERT(state::ParallelTeamSize == NumThreads);
      ASSERT(icv::ActiveLevel == 1u);
      ASSERT(icv::Level == 1u);

      if (TId < NumThreads)
        invokeMicrotask(TId, 0, fn, args, nargs);

      // Synchronize all threads at the end of a parallel region.
      synchronize::threadsAligned();
    }

    // Synchronize all threads to make sure every thread exits the scope above;
    // otherwise the following assertions and the assumption in
    // __kmpc_target_deinit may not hold.
    synchronize::threadsAligned();

    ASSERT(state::ParallelTeamSize == 1u);
    ASSERT(icv::ActiveLevel == 0u);
    ASSERT(icv::Level == 0u);
    return;
  }

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially the
  // active-level set, but they do not have individual ThreadStates yet. If
  // they ever modify the ICVs beyond this point, a ThreadState will be
  // allocated for them.
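  // (For example, a call to omp_set_num_threads() from inside the region would
  // modify the nthreads-var ICV and thus trigger such an allocation for the
  // calling thread.)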

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
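    // Manually unrolled copy into the shared buffer for the common small
    // argument counts: the fallthrough chain copies args[nargs-1] down to
    // args[0] without loop overhead, while more than 16 arguments fall back
    // to the loop in the default case.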
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
      [[fallthrough]];
    case 0:
      break;
    }
  }

  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread-specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                          1u, true, ident);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);

    // Master signals work to activate workers.
    synchronize::threads();
    // Master waits for workers to signal.
    synchronize::threads();
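    // (Between the two barriers the workers, waiting in the generic-mode state
    // machine, pick up wrapper_fn via __kmpc_kernel_parallel() and execute the
    // region; the ValueRAII destructors then restore the previous team state.)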
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}

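// Called by the workers from the generic-mode state machine after the
// activation barrier; returns true iff the calling thread is part of the team
// that should execute *WorkFn.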
__attribute__((noinline)) bool
__kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  FunctionTracingRAII();
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::ParallelTeamSize;
  return ThreadIsActive;
}

__attribute__((noinline)) void __kmpc_kernel_end_parallel() {
  FunctionTracingRAII();
  // In case an ICV was modified for this thread before its ThreadState was
  // created, drop the state now so it does not contaminate the next parallel
  // region.
  ASSERT(!mapping::isSPMDMode());
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode());
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) {
  FunctionTracingRAII();
  return omp_get_level();
}

int32_t __kmpc_global_thread_num(IdentTy *) {
  FunctionTracingRAII();
  return omp_get_thread_num();
}

void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {
  FunctionTracingRAII();
  // Intentionally a no-op on the device; the number of teams is fixed at
  // kernel launch.
}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {
  FunctionTracingRAII();
  // Intentionally a no-op on the device; proc_bind has no effect here.
}
}

#pragma omp end declare target