Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
128 commits
Select commit Hold shift + click to select a range
ce31566
Add first dummy version of NCCL backend
msimberg Nov 3, 2025
a4b913d
Clean up some unnecessary nccl files and try to port more mpi functio…
msimberg Nov 3, 2025
e14eeb3
Add todos
msimberg Nov 27, 2025
303ef54
Update nccl support
msimberg Dec 3, 2025
c01f1fe
Slightly more working nccl backend with events as requests and lots o…
msimberg Dec 18, 2025
11564e1
Enable one more nccl test
msimberg Dec 18, 2025
aca250b
Remove TODOs
msimberg Dec 19, 2025
ac03aaa
Add is_stream_aware, start_group, end_group to all backends
msimberg Dec 19, 2025
0331903
Clean up nccl event/request handling
msimberg Dec 19, 2025
3a4b321
Remove debugging print
msimberg Dec 19, 2025
325adf0
cleap
msimberg Dec 19, 2025
a724978
Clean up and disable some tests with NCCL
msimberg Dec 22, 2025
a13a3dd
Remove TODO
msimberg Dec 22, 2025
0c452c0
Add missing cuda_event.hpp file
msimberg Dec 22, 2025
162c745
Minor cleanup
msimberg Dec 22, 2025
15670aa
More cleanup
msimberg Dec 22, 2025
c1f70d0
Add missing stream argument
msimberg Jan 6, 2026
06d38ec
Add dummy stream parameter to libfabric and ucx backends
msimberg Jan 6, 2026
9653b59
Remove TODO from FindNCCL.cmake
msimberg Jan 6, 2026
3790aee
Remove TODO from test_locality.cpp
msimberg Jan 6, 2026
6e1c866
Add event pool and cached cuda event helper
msimberg Jan 7, 2026
9d38449
Remove duplicate key in clang-format config
msimberg Jan 8, 2026
46e0cf8
Format nccl files
msimberg Jan 8, 2026
b3359b3
Update src/nccl/nccl_communicator.hpp
msimberg Mar 20, 2026
4248180
Add missing FindNCCL.cmake for package install
msimberg Mar 19, 2026
2fabf67
Explicitly link against CUDA::cudart with NCCL
msimberg Mar 19, 2026
54fd8fe
Cleanup and check for device buffers with NCCL
msimberg Mar 20, 2026
b0a8228
Use plain throw to rethrow exception
msimberg Mar 20, 2026
4d60913
Handle shared/static NCCL library better
msimberg Mar 20, 2026
d68f893
Use [[maybe_unused]] for unused stream arguments
msimberg Mar 20, 2026
5ae81d4
Add helper to check if in active NCCL group
msimberg Mar 20, 2026
0c98a0e
Disallowing calling progress/wait while in an active NCCL group
msimberg Mar 20, 2026
a5bfaa8
Remove todo
msimberg Mar 20, 2026
c831f5d
Update readme
msimberg Mar 20, 2026
452074b
Add test for NCCL group semantics wrt wait/progress
msimberg Mar 23, 2026
0761f70
Fix nccl progress/wait check
msimberg Mar 23, 2026
fc5a14f
Clean up test_send_recv
msimberg Mar 23, 2026
e616b9f
Remove TODO about multiple nccl plugin inits (fixed in newer versions)
msimberg Mar 23, 2026
8cf11af
Remove extra parenthesis
msimberg Mar 23, 2026
3f4dc08
Make get_transport_option const
msimberg Mar 23, 2026
42f7163
Revert tag changes in test_send_multi
msimberg Mar 23, 2026
e41e726
Revert tag changes in test_locality.cpp
msimberg Mar 23, 2026
84cc2f4
Fix another get_transport_option
msimberg Mar 23, 2026
1dc6cfa
Remove old test
msimberg Mar 23, 2026
d3b6b96
Add notes about NCCL to readme
msimberg Mar 23, 2026
2121de0
Update test_send_recv for NCCL
msimberg Mar 23, 2026
6db091c
Remove device buffer check for NCCL communicator
msimberg Mar 23, 2026
edc6c32
Remove todo
msimberg Mar 23, 2026
11c6c4b
Expand readme with more nccl info
msimberg Mar 23, 2026
ab051d8
Clean up nccl backend
msimberg Mar 23, 2026
dff4b8f
Add missing const
msimberg Mar 23, 2026
2509800
Add missing find_dependency(NCCL) to oomphConfig.cmake.in
msimberg Mar 25, 2026
040514f
Add cicd-ext CI configuration
msimberg Mar 25, 2026
ba1784c
Apply suggestions from code review
msimberg Mar 25, 2026
8cf5076
Apply suggestion from @msimberg
msimberg Mar 25, 2026
5423771
Fix CI container build args
msimberg Mar 25, 2026
3f15f1f
Specify oomph@main in spack environments
msimberg Mar 25, 2026
b832fe6
Remove +python from spack specs
msimberg Mar 25, 2026
1e851df
Remove stages
msimberg Mar 25, 2026
0398d59
Refactor ci config
msimberg Mar 25, 2026
b1297ac
Fix base image
msimberg Mar 26, 2026
3484722
Fix typo
msimberg Mar 26, 2026
33624ef
Fix env file path
msimberg Mar 26, 2026
a3e950d
Update cmake config in CI
msimberg Mar 26, 2026
9871e88
Use NUM_PROCS instead of nproc
msimberg Mar 26, 2026
177592d
Fix num procs
msimberg Mar 26, 2026
110d4eb
Update test job config
msimberg Mar 26, 2026
9518354
Fix syntax
msimberg Mar 26, 2026
5419c5b
Fix parallel testing
msimberg Mar 26, 2026
ffd02a4
Explicitly ask for one gpu per task
msimberg Mar 26, 2026
a81f37f
Verbose ctest output
msimberg Mar 26, 2026
7b35569
Explicitly set debug build for CI
msimberg Mar 26, 2026
60e0e25
Don't set any mpiexec options if MPIEXEC_EXECUTABLE is empty
msimberg Mar 26, 2026
c3ea568
Don't buffer test output
msimberg Mar 26, 2026
521011e
Skip cancel test
msimberg Mar 26, 2026
abb4188
Fix slurm variables
msimberg Mar 26, 2026
3689ebe
Shorten timeouts
msimberg Mar 26, 2026
841d97b
Don't load cxi hooks in CI
msimberg Mar 26, 2026
d7995af
Update slurm and ctest options
msimberg Mar 26, 2026
c91ae1c
List libfabric and ucx info in CI
msimberg Apr 9, 2026
b74e96d
Clean up test templates
msimberg Apr 9, 2026
80ce06c
Disable NCCL CI pipelines since it's not yet supported
msimberg Apr 9, 2026
3fed378
Small cleanup and parallel non-distributed tests in CI
msimberg Apr 9, 2026
38112f2
strace ctest call
msimberg Apr 10, 2026
f1e54aa
Verbose CI tests
msimberg Apr 10, 2026
faf58b8
Remove verbose parallel tests in CI
msimberg Apr 10, 2026
e2e7d9c
Fix fortran parallel tests when MPIEXEC_EXECUTABLE is empty
msimberg Apr 10, 2026
d83dbaa
Add missing BACKEND build arg to build step in CI
msimberg Apr 10, 2026
a21ac78
Singular hour
msimberg Apr 10, 2026
d2ae1f0
Double word
msimberg Apr 10, 2026
bdd9374
Remove unnecessary extends
msimberg Apr 10, 2026
d89bf20
More curl flags
msimberg Apr 10, 2026
45d6992
Use prerelease base image in CI
msimberg Apr 13, 2026
aa0e244
Merge branch 'cicd-ext' into nccl-context
msimberg Apr 13, 2026
a26b443
Enable nccl testing in CI config
msimberg Apr 13, 2026
99adee5
Use spack-packages with oomph backend=nccl support
msimberg Apr 13, 2026
a510460
Use separate Testing directory for ctest per process
msimberg Apr 13, 2026
841b9bf
Use separate Testing directory for ctest per process
msimberg Apr 13, 2026
c80510f
Try to fix ctest wrapper
msimberg Apr 13, 2026
38d8819
Merge branch 'cicd-ext' into nccl-context
msimberg Apr 13, 2026
55d2529
Try to create testing directory first
msimberg Apr 13, 2026
b23c3e4
cd into build directory
msimberg Apr 13, 2026
5fe5a13
cd into build directory
msimberg Apr 13, 2026
e55f1ad
Fix testing path
msimberg Apr 13, 2026
f6b88d5
Merge branch 'cicd-ext' into nccl-context
msimberg Apr 13, 2026
2ce53ed
Try without --map-root-user
msimberg Apr 13, 2026
9c71102
Try something else for ctest deadlocks
msimberg Apr 13, 2026
5f3f6bb
Fix syntax error
msimberg Apr 13, 2026
e33fd3d
Add sleep just to be safe when symlinking testing directory
msimberg Apr 14, 2026
a00c372
Use public path for base images
msimberg Apr 14, 2026
07c0cf6
Merge branch 'cicd-ext' into nccl-context
msimberg Apr 15, 2026
f9b528d
Disable libfabric tests in cicd-ext
msimberg Apr 16, 2026
0a01346
Merge branch 'cicd-ext' into nccl-context
msimberg Apr 17, 2026
a62c496
Merge remote-tracking branch 'origin/main' into nccl-context
msimberg Apr 29, 2026
c7d7ebe
Disallow self-send/recv with NCCL
msimberg Jun 2, 2026
fa9e662
Minor simplification and copyright changes
msimberg Jun 2, 2026
cffef8d
Don't record event if no communication was done in nccl group
msimberg Jun 2, 2026
993c4c2
Yield when waiting for nccl init to finish
msimberg Jun 2, 2026
365fab9
Expose get_transport_option in communicator
msimberg Jun 2, 2026
be60797
Add missing file
msimberg Jun 3, 2026
bea94a8
Fix world -> MPI_COMM_WORLD
msimberg Jun 3, 2026
98603d9
Add missing alias
msimberg Jun 3, 2026
f35c43a
Reformat test helper
msimberg Jun 3, 2026
aad3727
Small refactor
msimberg Jun 3, 2026
130059d
Skip cancellation tests with gtest instead of early return
msimberg Jun 3, 2026
2d66b7d
Bump hwmalloc to latest master (c033330)
msimberg Jun 25, 2026
35811d3
Merge remote-tracking branch 'origin/main' into nccl-context
msimberg Jun 25, 2026
3d7f658
Enable self-send/recv within NCCL groups
msimberg Jun 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 26 additions & 29 deletions .cscs-ci/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ include:
variables:
BASE_IMAGE: jfrog.svc.cscs.ch/docker-group-csstaff/alps-images/ngc-pytorch:26.01-py3-alps4-dev
SPACK_SHA: v1.1.1
SPACK_PACKAGES_SHA: bc93746ce936d6653271b6e98f6df6ee28f64e84 # develop on 2026-03-25
SPACK_PACKAGES_SHA: 5f24787b5cd3c2356d9a8188b989ceb5307299c6 # https://github.com/msimberg/spack-packages/tree/oomph-nccl

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be changed to spack-packages develop before merging. Once approved, I'll get the nccl variant merged upstream and update this. For now this points to a branch.

FF_TIMESTAMPS: true

.build_deps_template:
Expand All @@ -25,13 +25,12 @@ variables:
reports:
dotenv: base-${BACKEND}.env

# TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55
# build_deps_nccl:
# variables:
# BACKEND: nccl
# extends:
# - .container-builder-cscs-gh200
# - .build_deps_template
# NCCL variant of the dependency build job: combines the
# .container-builder-cscs-gh200 and .build_deps_template templates and selects
# the backend through the BACKEND variable.
build_deps_nccl:
variables:
BACKEND: nccl
extends:
- .container-builder-cscs-gh200
- .build_deps_template

build_deps_mpi:
variables:
Expand Down Expand Up @@ -69,14 +68,13 @@ build_deps_ucx:
reports:
dotenv: build-${BACKEND}.env

# TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55
# build_nccl:
# variables:
# BACKEND: nccl
# extends: .build_template
# needs:
# - job: build_deps_nccl
# artifacts: true
# NCCL variant of the build job: instantiates .build_template with BACKEND=nccl
# and consumes the artifacts produced by build_deps_nccl.
build_nccl:
variables:
BACKEND: nccl
extends: .build_template
needs:
- job: build_deps_nccl
artifacts: true

build_mpi:
variables:
Expand Down Expand Up @@ -136,20 +134,19 @@ build_ucx:
- sleep 1
- ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60

# TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55
# test_serial_nccl:
# extends: .test_serial_template
# needs:
# - job: build_nccl
# artifacts: true
# image: $BUILD_IMAGE
# Serial test job for the NCCL build: instantiates .test_serial_template, runs
# in the image named by $BUILD_IMAGE and consumes the artifacts of build_nccl.
test_serial_nccl:
extends: .test_serial_template
needs:
- job: build_nccl
artifacts: true
image: $BUILD_IMAGE

# test_parallel_nccl:
# extends: .test_parallel_template
# needs:
# - job: build_nccl
# artifacts: true
# image: $BUILD_IMAGE
# Parallel test job for the NCCL build: instantiates .test_parallel_template,
# runs in the image named by $BUILD_IMAGE and consumes the artifacts of
# build_nccl.
test_parallel_nccl:
extends: .test_parallel_template
needs:
- job: build_nccl
artifacts: true
image: $BUILD_IMAGE

test_serial_mpi:
extends: .test_serial_template
Expand Down
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ include(oomph_ucx)
# ---------------------------------------------------------------------
include(oomph_libfabric)

# ---------------------------------------------------------------------
# oomph NCCL variant
# ---------------------------------------------------------------------
include(oomph_nccl)

# ---------------------------------------------------------------------
# main src subdir
# ---------------------------------------------------------------------
Expand Down Expand Up @@ -142,6 +147,7 @@ install(
${CMAKE_CURRENT_BINARY_DIR}/oomphConfig.cmake
${CMAKE_CURRENT_BINARY_DIR}/oomphConfigVersion.cmake
${CMAKE_CURRENT_LIST_DIR}/cmake/FindLibfabric.cmake
${CMAKE_CURRENT_LIST_DIR}/cmake/FindNCCL.cmake
${CMAKE_CURRENT_LIST_DIR}/cmake/FindUCX.cmake
${CMAKE_CURRENT_LIST_DIR}/cmake/FindPMIx.cmake
DESTINATION
Expand Down
68 changes: 65 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
# OOMPH

**Oomph** is a library for enabling high performance point-to-point, asynchronous communication over
different fabrics. It leverages the ubiquitos MPI library as well as UCX and Libfabric. Both
device and host memory are supported. Under the hood it uses
[hwmalloc](https://github.com/ghex-org/hwmalloc) for memory registration.
different fabrics. It leverages the ubiquitous MPI library as well as UCX and Libfabric. Both
device and host memory are supported. A subset of functionality is also supported with NCCL. Under
the hood it uses [hwmalloc](https://github.com/ghex-org/hwmalloc) for memory registration.

**selling points**
- lightweight, fast
Expand Down Expand Up @@ -136,6 +136,68 @@ comm.progress();
// or progress until some event is triggered
while(!completed) { comm.progress(); }
```

### Groups

Communicators expose group functionality as provided by NCCL (with
ncclGroupStart and ncclGroupEnd). For non-NCCL backends the group functionality
is a no-op. For NCCL, using the group functionality can be both a requirement
to avoid deadlocks (communication within a group can make progress
independently, while outside of a group communication is ordered) and a benefit
for performance (a single device kernel is submitted for a NCCL group).

Groups are created by explicitly starting and ending the group:

```cpp
comm.start_group();
oomph::send_request sreq = comm.send(smsg, 1, 0);
oomph::recv_request rreq = comm.recv(rmsg, 1, 0);
comm.end_group();

// With NCCL, no progress will be made until after the group ends
sreq.wait();
rreq.wait();
```

### Stream awareness

Some backend implementations can schedule communication on a GPU stream.
Currently only the NCCL backend makes use of this. All other backends ignore
the stream argument. To query if a backend is stream-aware use the
`is_stream_aware` member query on a communicator. The stream can be passed as
an optional last parameter to `send` or `recv`:

```cpp
if (comm.is_stream_aware()) {
    // Schedule communication on the default CUDA stream if the backend is
    // stream aware
    cudaStream_t stream = 0;
    oomph::send_request sreq = comm.send(msg, 1, 0, stream);
}
```

### NCCL restrictions

NCCL has significantly different semantics from MPI, libfabric, and UCX which
is reflected in a number of restrictions on how the NCCL communicator can be
used:

- Tags are not supported by NCCL and ignored by the backend. Communication
order on different ranks must match (except within NCCL groups where there is
some flexibility). This also means that e.g. recv should not be called before
send unless within a NCCL group.
- The `thread_safe` option for the NCCL communicator is not supported because
of the above ordering restrictions.
- Cancellation is not supported.
- `wait` and `progress` are disallowed when a NCCL group is active as no
progress can be made until a NCCL group is ended and submitted.
- Send/recv to own rank is supported within NCCL groups. Outside of a group,
self-send/recv will throw an exception because NCCL's order-based matching
requires the atomic submission that groups provide.

The NCCL backend is primarily designed for use in GHEX where these differences
can be hidden from the user.

## Acknowledgments
This work was financially supported by the PRACE project funded in part by the EU's Horizon 2020
Research and Innovation programme (2014-2020) under grant agreement 823767.
78 changes: 78 additions & 0 deletions cmake/FindNCCL.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# From https://github.com/pytorch/gloo/blob/main/cmake/Modules/Findnccl.cmake.

# Try to find NCCL
#
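# The following variables are optionally searched for defaults
# NCCL_ROOT_DIR: Base directory where all NCCL components are found
#                (initialized from the NCCL_ROOT_DIR environment variable)
# NCCL_INCLUDE_DIR: Directory where NCCL header is found
# NCCL_LIB_DIR: Directory where NCCL library is found
#
# If the environment variable USE_STATIC_NCCL is set to a non-empty value, the
# static library (libnccl_static.a) is searched for instead of the shared one.
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIRS
# NCCL_LIBRARIES
# NCCL_MAJOR_VERSION (only if it can be parsed from nccl.h)
# NCCL::nccl (imported target, created when NCCL is found)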
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601

# Base directory hint. The NCCL_ROOT_DIR environment variable provides the
# initial value of the cache entry; an existing cache value is not overwritten.
set(NCCL_ROOT_DIR $ENV{NCCL_ROOT_DIR} CACHE PATH "Folder contains NVIDIA NCCL")

# Locate the directory that contains nccl.h.
find_path(NCCL_INCLUDE_DIR
    NAMES nccl.h
    HINTS
        ${NCCL_INCLUDE_DIR}
        ${NCCL_ROOT_DIR}
        ${NCCL_ROOT_DIR}/include
        ${CUDA_TOOLKIT_ROOT_DIR}/include)

# A non-empty USE_STATIC_NCCL environment variable selects the static library.
# _use_static_nccl records the choice for the imported target created later.
if(DEFINED ENV{USE_STATIC_NCCL} AND NOT "$ENV{USE_STATIC_NCCL}" STREQUAL "")
    message(STATUS "USE_STATIC_NCCL detected. Linking against static NCCL library")
    set(_use_static_nccl ON)
    set(NCCL_LIBNAME "libnccl_static.a")
else()
    set(_use_static_nccl OFF)
    set(NCCL_LIBNAME "nccl")
endif()

# Locate the library itself. Besides the fixed x86_64 multiarch directory, the
# multiarch directory of the current target (CMAKE_LIBRARY_ARCHITECTURE, e.g.
# aarch64-linux-gnu) is searched so that non-x86_64 installations are found too.
find_library(NCCL_LIBRARY
    NAMES ${NCCL_LIBNAME}
    HINTS
        ${NCCL_LIB_DIR}
        ${NCCL_ROOT_DIR}
        ${NCCL_ROOT_DIR}/lib
        ${NCCL_ROOT_DIR}/lib/${CMAKE_LIBRARY_ARCHITECTURE}
        ${NCCL_ROOT_DIR}/lib/x86_64-linux-gnu
        ${NCCL_ROOT_DIR}/lib64
        ${CUDA_TOOLKIT_ROOT_DIR}/lib64)

# Sets NCCL_FOUND and reports an error for REQUIRED lookups when either the
# header directory or the library is missing.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARY)

# Post-processing once header and library have been located: parse the major
# version, publish the conventional result variables and define NCCL::nccl.
if (NCCL_FOUND)
    set(NCCL_HEADER_FILE "${NCCL_INCLUDE_DIR}/nccl.h")
    message(STATUS "Determining NCCL version from the header file: ${NCCL_HEADER_FILE}")
    # Paths are quoted so that directories containing spaces are handled.
    file(STRINGS "${NCCL_HEADER_FILE}" NCCL_MAJOR_VERSION_DEFINED
        REGEX "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+.*$" LIMIT_COUNT 1)
    if (NCCL_MAJOR_VERSION_DEFINED)
        # Keep only the digits; anything following the number on the #define
        # line (whitespace, comments) is dropped.
        string(REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+([0-9]+).*$" "\\1"
            NCCL_MAJOR_VERSION "${NCCL_MAJOR_VERSION_DEFINED}")
        message(STATUS "NCCL_MAJOR_VERSION: ${NCCL_MAJOR_VERSION}")
    endif()

    set(NCCL_INCLUDE_DIRS "${NCCL_INCLUDE_DIR}")
    set(NCCL_LIBRARIES "${NCCL_LIBRARY}")
    message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")

    # Hide the cache entries created by this module from the default cache
    # view. NCCL_INCLUDE_DIRS/NCCL_LIBRARIES are plain variables, not cache
    # entries, so the cache variables NCCL_INCLUDE_DIR/NCCL_LIBRARY are the
    # ones to mark.
    mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIR NCCL_LIBRARY)

    # Imported target carrying the library location and include directory. The
    # library type follows the static/shared choice made during the search.
    if(NOT TARGET NCCL::nccl)
        if(_use_static_nccl)
            add_library(NCCL::nccl STATIC IMPORTED)
        else()
            add_library(NCCL::nccl SHARED IMPORTED)
        endif()
        set_target_properties(NCCL::nccl PROPERTIES
            IMPORTED_LOCATION "${NCCL_LIBRARIES}"
            INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIRS}"
        )
    endif()
endif()

3 changes: 3 additions & 0 deletions cmake/oomphConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,8 @@ if (@OOMPH_WITH_LIBFABRIC@)
#set(LIBFABRIC_INCLUDE_DIR @ULIBFABRIC_INCLUDE_DIRS@)
find_dependency(Libfabric)
endif()
if (@OOMPH_WITH_NCCL@)
    # The NCCL backend target links publicly against both NCCL::nccl and
    # CUDA::cudart, so both imported targets must exist before the exported
    # targets file is included.
    find_dependency(CUDAToolkit)
    find_dependency(NCCL)
endif()
include(${CMAKE_CURRENT_LIST_DIR}/oomph-targets.cmake)

19 changes: 19 additions & 0 deletions cmake/oomph_nccl.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Configuration of the optional NCCL backend

#------------------------------------------------------------------------------
# User-facing switch for the NCCL backend (disabled by default)
#------------------------------------------------------------------------------
set(OOMPH_WITH_NCCL OFF CACHE BOOL "Build with NCCL backend")

if (OOMPH_WITH_NCCL)
    # Both the CUDA toolkit and NCCL are mandatory for this backend
    find_package(CUDAToolkit REQUIRED)
    find_package(NCCL REQUIRED)

    # Backend library and its namespaced alias
    add_library(oomph_nccl SHARED)
    add_library(oomph::nccl ALIAS oomph_nccl)
    oomph_shared_lib_options(oomph_nccl)

    # NCCL and the CUDA runtime are propagated to consumers of the backend
    target_link_libraries(oomph_nccl
        PUBLIC
            NCCL::nccl
            CUDA::cudart)

    # Install the library as part of the oomph-targets export set
    install(
        TARGETS oomph_nccl
        EXPORT oomph-targets
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()
2 changes: 1 addition & 1 deletion ext/hwmalloc
Submodule hwmalloc updated 0 files
Loading
Loading