//===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation in the GPU. Here is the pattern:
//
//    while (not finished) {
//
//      if (master) {
//        sequential code, decide which par loop to do, or if finished
//        __kmpc_kernel_prepare_parallel() // exec by master only
//      }
//      syncthreads // A
//      __kmpc_kernel_parallel() // exec by all
//      if (this thread is included in the parallel) {
//        switch () for all parallel loops
//        __kmpc_kernel_end_parallel() // exec only by threads in parallel
//      }
//    }
//
//    The reason we do not execute end_parallel for the threads not included
//    in the parallel loop is that, for each barrier in the parallel region,
//    these non-included threads will cycle through syncthreads A. Thus they
//    must preserve their current threadId, which is larger than the number
//    of threads in the team.
//
//    To make a long story short...
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "target_impl.h"

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////

INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
                                                uint16_t NThreadsICV,
                                                uint16_t ThreadLimit) {
  uint16_t ThreadsRequested = NThreadsICV;
  if (NumThreadsClause != 0) {
    ThreadsRequested = NumThreadsClause;
  }

  uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
    ThreadsAvailable = ThreadLimit;
  }

  uint16_t NumThreads = ThreadsAvailable;
  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
    NumThreads = ThreadsRequested;
  }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // On Volta and newer architectures we require that all lanes in a warp
  // participate in the parallel region.  Round down to a multiple of
  // WARPSIZE since it is legal to do so in OpenMP.
  if (NumThreads < WARPSIZE) {
    NumThreads = 1;
  } else {
    NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1));
  }
#endif

  return NumThreads;
}

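// For illustration (assuming WARPSIZE == 32, a Volta or newer target, and a
// team with 96 workers): a request for 48 threads is rounded down to a warp
// multiple, i.e. determineNumberOfThreads(48, 0, 0) == 32, while any request
// below a full warp collapses to a single thread, e.g.
// determineNumberOfThreads(20, 0, 0) == 1.
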
// This routine is always called by the team master.
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn) {
  PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");

  omptarget_nvptx_workFn = WorkFn;

  // This routine is only called by the team master.  The team master is
  // the first thread of the last warp.  It always has the logical thread
  // id of 0 (since it is a shadow for the first worker thread).
  const int threadId = 0;
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
  ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
          "cannot be called in a parallel region.");
  if (currTaskDescr->InParallelRegion()) {
    PRINT0(LD_PAR, "already in parallel: go seq\n");
    return;
  }

  uint16_t &NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(
          threadId);

  uint16_t NumThreads =
      determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);

  if (NumThreadsClause != 0) {
    // Reset the request to avoid propagating it to successive #parallel
    // directives.
    NumThreadsClause = 0;
  }

  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
         (int)NumThreads);
  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
          "only team master can create parallel");

  // Set the number of threads on the work descriptor.
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
  threadsInTeam = NumThreads;
}

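// A minimal sketch (illustrative only, not part of the runtime) of the
// master-side half of the pattern from the file header; outlined_fn is a
// hypothetical name for the compiler-outlined parallel region:
//
//   if (master) {
//     // sequential code; decide which parallel region to run, or finish
//     __kmpc_kernel_prepare_parallel((void *)outlined_fn); // master only
//   }
//   syncthreads // barrier A: after this, all call __kmpc_kernel_parallel
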
// All workers call this function.  Deactivate those not needed.
// WorkFn - returns the outlined work function to execute.
// Returns true if this thread is active, else false.
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");

  // Work function and arguments for L1 parallel region.
  *WorkFn = omptarget_nvptx_workFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn) {
    PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n");
    return false;
  }

  // Only the worker threads call this routine and the master warp
  // never arrives here.  Therefore, use the nvptx thread id.
  int threadId = GetThreadIdInBlock();
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads.
  if (threadId < threadsInTeam) {
    // Initialize the task descriptor from the work descriptor.
    omptarget_nvptx_TaskDescr *newTaskDescr =
        omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
    newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
    // Install the new top descriptor.
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    // Initialize private data from the initial values.
    PRINT(LD_PAR,
          "thread will execute parallel region with id %d in a team of "
          "%d threads\n",
          (int)newTaskDescr->ThreadId(), (int)nThreads);

    isActive = true;
    // Reconverge the threads at the end of the parallel region to correctly
    // handle parallel levels.
    // With CUDA 9+ in non-SPMD mode we have either 1 worker thread or the
    // whole warp. If only 1 thread is active, there is no need to reconverge
    // the threads. If we have the whole warp, reconverge all the threads in
    // the warp before actually trying to change the parallel level.
    // Otherwise, the parallel level can be changed incorrectly because of
    // thread divergence.
    bool IsActiveParallelRegion = threadsInTeam != 1;
    IncParallelLevel(IsActiveParallelRegion,
                     IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u);
  }

  return isActive;
}

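// A minimal sketch (illustrative only) of the worker loop that drives the
// entry points above; syncthreads stands for the block-wide barrier "A" from
// the pattern in the file header:
//
//   void *WorkFn = 0;
//   while (not finished) {
//     syncthreads                               // barrier A
//     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
//     if (!WorkFn)                              // termination signal
//       break;                                  //   from the master
//     if (IsActive) {
//       ((void (*)(void))WorkFn)();             // run the outlined region
//       __kmpc_kernel_end_parallel();           // active threads only
//     }
//   }
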
EXTERN void __kmpc_kernel_end_parallel() {
  // Pop the stack.
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  // Only the worker threads call this routine and the master warp
  // never arrives here.  Therefore, use the nvptx thread id.
  int threadId = GetThreadIdInBlock();
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());

  // Reconverge the threads at the end of the parallel region to correctly
  // handle parallel levels.
  // With CUDA 9+ in non-SPMD mode we have either 1 worker thread or the
  // whole warp. If only 1 thread is active, there is no need to reconverge
  // the threads. If we have the whole warp, reconverge all the threads in
  // the warp before actually trying to change the parallel level. Otherwise,
  // the parallel level can be changed incorrectly because of thread
  // divergence.
  bool IsActiveParallelRegion = threadsInTeam != 1;
  DecParallelLevel(IsActiveParallelRegion,
                   IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u);
}

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes sequential
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");

  IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

  if (checkRuntimeUninitialized(loc)) {
    ASSERT0(LT_FUSSY, checkSPMDMode(loc),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // Assume this is only called for a nested parallel region.
  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));

  // Unlike an actual parallel region, threads in the same team do not share
  // the workTaskDescr in this case, and the number of threads is fixed to 1.

  // Get the current task.
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->SaveLoopData();

  // Allocate a new task descriptor, copy values from the current one, and
  // set its previous pointer to the current one.
  omptarget_nvptx_TaskDescr *newTaskDescr =
      (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
                                              "new seq parallel task");
  newTaskDescr->CopyParent(currTaskDescr);

  // Tweak values for the serialized parallel case:
  // - each thread becomes ID 0 in its serialized parallel region, and
  // - there is only one thread per team.
  newTaskDescr->ThreadId() = 0;

  // Set the new task descriptor as the top of the stack.
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                             newTaskDescr);
}

EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
                                           uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");

  DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

  if (checkRuntimeUninitialized(loc)) {
    ASSERT0(LT_FUSSY, checkSPMDMode(loc),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // Pop the stack.
  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  // Set the new top.
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());
  // Free the descriptor allocated in __kmpc_serialized_parallel.
  SafeFree(currTaskDescr, "new seq parallel task");
  currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->RestoreLoopData();
}

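// For illustration: the serialized entry points above are reached for nested
// parallel regions, which execute sequentially on the device, e.g.:
//
//   #pragma omp target parallel
//   {
//     #pragma omp parallel // nested: lowered to __kmpc_serialized_parallel /
//     { ... }              //   __kmpc_end_serialized_parallel around the body
//   }
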
EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_parallel_level\n");

  return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
}

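// For illustration: parallelLevel packs an "active" flag into the
// OMP_ACTIVE_PARALLEL_LEVEL bit, so masking with OMP_ACTIVE_PARALLEL_LEVEL - 1
// leaves only the nesting count. E.g., if OMP_ACTIVE_PARALLEL_LEVEL is 128
// (0x80), a stored value of 0x82 decodes as "active, nesting level 2" and
// this routine returns 2.
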
// This kmpc call returns the thread id across all teams. Its value is
// cached by the compiler and used when calling the runtime. On nvptx
// it is cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  return GetOmpThreadId(tid, checkSPMDMode(loc));
}

////////////////////////////////////////////////////////////////////////////////
// push params
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
                                    int32_t num_threads) {
  PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
          "Runtime must be initialized.");
  tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
      num_threads;
}

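// For illustration: the compiler is expected to lower
//   #pragma omp parallel num_threads(n)
// to a call of __kmpc_push_num_threads(loc, tid, n) before starting the
// parallel region; the stored request is then consumed, and reset, by
// __kmpc_kernel_prepare_parallel() above.
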
// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.
EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
                                  int32_t num_teams, int32_t thread_limit) {
  PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams);
  ASSERT0(LT_FUSSY, 0, "should never have anything with new teams on device");
}

EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid,
                                  int proc_bind) {
  PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
}

#pragma omp end declare target