NvSciSyncFence *signalerFence = NULL;
NvSciSyncFence *waiterFence = NULL;
NvSciIpcEndpoint signalerIpcEndpoint = 0;
NvSciIpcEndpoint waiterIpcEndpoint = 0;
NvSciSyncAttrList unreconciledList[2] = {NULL};
NvSciSyncAttrList reconciledList = NULL;
NvSciSyncAttrList newConflictList = NULL;
NvSciSyncAttrList signalerAttrList = NULL;
NvSciSyncAttrList waiterAttrList = NULL;
NvSciSyncAttrList importedWaiterAttrList = NULL;
NvSciSyncObjIpcExportDescriptor objDesc;
NvSciSyncFenceIpcExportDescriptor fenceDesc;
NvSciSyncObj signalObj;
NvSciSyncObj waitObj;
NvSciSyncModule module = NULL;
void* objAndList;
size_t objAndListSize = 0;
void* waiterListDesc;
size_t waiterAttrListSize = 0;
CUcontext signalerCtx = 0;
CUcontext waiterCtx = 0;
int iGPU = 0;
int dGPU = 1;
cudaStream_t signalerCudaStream;
cudaStream_t waiterCudaStream;
cudaExternalSemaphore_t signalerSema, waiterSema;
cudaExternalSemaphoreHandleDesc semaDesc;
cudaExternalSemaphoreSignalParams sigParams;
cudaExternalSemaphoreWaitParams waitParams;
/*****************INIT PHASE**************************/
err = NvSciSyncModuleOpen(&module);
err = NvSciIpcInit();
err = NvSciIpcOpenEndpoint("ipc_test", &signalerIpcEndpoint);
err = NvSciIpcOpenEndpoint("ipc_test", &waiterIpcEndpoint);
err = NvSciSyncAttrListCreate(module, &signalerAttrList);
err = NvSciSyncAttrListCreate(module, &waiterAttrList);
signalerFence = (NvSciSyncFence *)calloc(1, sizeof(*signalerFence));
waiterFence = (NvSciSyncFence *)calloc(1, sizeof(*waiterFence));
cudaFree(0);
cudaSetDevice(iGPU);// Signaler will be on Device-1/iGPU
cuCtxCreate(&signalerCtx, CU_CTX_MAP_HOST, iGPU);
cudaSetDevice(dGPU);// Waiter will be on Device-0/dGPU
cuCtxCreate(&waiterCtx, CU_CTX_MAP_HOST, dGPU);
cuCtxPushCurrent(signalerCtx);
cudaStreamCreate(&signalerCudaStream);
cuCtxPopCurrent(&signalerCtx);
cuCtxPushCurrent(waiterCtx);
cudaStreamCreate(&waiterCudaStream);
cuCtxPopCurrent(&waiterCtx);
cuCtxPushCurrent(waiterCtx);
cudaDeviceGetNvSciSyncAttributes(waiterAttrList, dGPU, cudaNvSciSyncAttrWait);
err = NvSciSyncAttrListIpcExportUnreconciled(&waiterAttrList, 1, waiterIpcEndpoint, &waiterListDesc, &waiterAttrListSize);
// Allocate cuda memory for the signaler, if needed
cuCtxPopCurrent(&waiterCtx);
cuCtxPushCurrent(signalerCtx);
cudaDeviceGetNvSciSyncAttributes(signalerAttrList, iGPU, cudaNvSciSyncAttrSignal);
// Allocate cuda memory for the waiter, if needed
err = NvSciSyncAttrListIpcImportUnreconciled(module, signalerIpcEndpoint, waiterListDesc, waiterAttrListSize, &importedWaiterAttrList);
cuCtxPopCurrent(&signalerCtx);
unreconciledList[0] = signalerAttrList;
unreconciledList[1] = importedWaiterAttrList;
err = NvSciSyncAttrListReconcile(unreconciledList, 2, &reconciledList, &newConflictList);
err = NvSciSyncObjAlloc(reconciledList, &signalObj);
// Export Created NvSciSyncObj and attribute list to waiter
err = NvSciSyncIpcExportAttrListAndObj(signalObj, NvSciSyncAccessPerm_WaitOnly, signalerIpcEndpoint, &objAndList, &objAndListSize);
// Import already created NvSciSyncObj into a new NvSciSyncObj
err = NvSciSyncIpcImportAttrListAndObj(module, waiterIpcEndpoint, objAndList, objAndListSize, &waiterAttrList, 1, NvSciSyncAccessPerm_WaitOnly, 1000000, &waitObj);
cuCtxPushCurrent(signalerCtx);
semaDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync;
semaDesc.handle.nvSciSyncObj = (void*)signalObj;
cudaImportExternalSemaphore(&signalerSema, &semaDesc);
cuCtxPopCurrent(&signalerCtx);
cuCtxPushCurrent(waiterCtx);
semaDesc.type = cudaExternalSemaphoreHandleTypeNvSciSync;
semaDesc.handle.nvSciSyncObj = (void*)waitObj;
cudaImportExternalSemaphore(&waiterSema, &semaDesc);
cuCtxPopCurrent(&waiterCtx);
/**********************************************************/
/*****************STREAMING PHASE**************************/
cuCtxPushCurrent(signalerCtx);
sigParams.params.nvSciSync.fence = (void*)signalerFence;
sigParams.flags = 0; //Set flags = cudaExternalSemaphoreSignalSkipNvSciBufMemSync if needed
// LAUNCH CUDA WORK ON signalerCudaStream
cudaSignalExternalSemaphoresAsync(&signalerSema, &sigParams, 1, signalerCudaStream);
err = NvSciSyncIpcExportFence(signalerFence, signalerIpcEndpoint, &fenceDesc);
NvSciSyncFenceClear(signalerFence);
cuCtxPopCurrent(&signalerCtx);
cuCtxPushCurrent(waiterCtx);
err = NvSciSyncIpcImportFence(waitObj, &fenceDesc, waiterFence);
waitParams.params.nvSciSync.fence = (void*)waiterFence;
waitParams.flags = 0; //Set flags = cudaExternalSemaphoreWaitSkipNvSciBufMemSync if needed
cudaWaitExternalSemaphoresAsync(&waiterSema, &waitParams, 1, waiterCudaStream);
// LAUNCH CUDA WORK ON waiterCudaStream
cudaStreamSynchronize(waiterCudaStream);
cuCtxPopCurrent(&waiterCtx);
/**********************************************************/
/*****************TEAR-DOWN PHASE**************************/
NvSciSyncObjFree(signalObj);
NvSciSyncObjFree(waitObj);
NvSciSyncAttrListFree(reconciledList);
NvSciSyncAttrListFree(newConflictList);
NvSciSyncAttrListFree(signalerAttrList);
NvSciSyncAttrListFree(waiterAttrList);
NvSciSyncAttrListFree(importedWaiterAttrList);
NvSciSyncModuleClose(module);
NvSciIpcCloseEndpoint(signalerIpcEndpoint);
NvSciIpcCloseEndpoint(waiterIpcEndpoint);
cudaStreamSynchronize(signalerCudaStream);
cudaStreamSynchronize(waiterCudaStream);
cudaStreamDestroy(waiterCudaStream);
cudaStreamDestroy(signalerCudaStream);
cudaDestroyExternalSemaphore(signalerSema);
cudaDestroyExternalSemaphore(waiterSema);
cuCtxDestroy(signalerCtx);
cuCtxDestroy(waiterCtx);
free(signalerFence);
free(waiterFence);
/**********************************************************/