Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
e73577c
Split send/recv test into independent cpu and device mode tests
biddisco Apr 17, 2026
2c03730
Bump hwmalloc submodule
biddisco Apr 16, 2026
e1d177b
Clang-format files affected by libfabric changes
biddisco Nov 13, 2025
e171551
disable clang-format for cmake generated code, fix missing include
biddisco Jul 7, 2025
66f7fd6
Add shm provider support to libfabric transport layer
biddisco Jul 7, 2025
2a4ba06
Remove unused includes and fix warnings in libfabric backend
biddisco Jul 7, 2025
5961661
Remove ipaddress locality functions and instead use AV fi_to_str
biddisco Jul 7, 2025
24e113e
Add LNX provider, simplify provider #ifdefs and fabric hints/info setup
biddisco Jul 7, 2025
a1f3fa6
Use thread mask (instead of boost::physical_concurrency) for num thre…
biddisco Jul 9, 2025
ea38cbf
Fix cxi initialization, some hints must be set before fi_info becomes…
biddisco Jul 9, 2025
f668ee7
Disable debug messages
biddisco Jul 9, 2025
e26f4bd
Clean up debug: namespace usage and rename ptr to (hex) hptr
biddisco Jul 9, 2025
526088c
Use safe fi_tostr_r and a std::array buffer in place of fi_tostr
biddisco Jul 10, 2025
9c7c6df
Fixes to support new hwmalloc API
biddisco Nov 13, 2025
2764787
ifdefs for LNX provider, especially address unsupported address-strin…
biddisco Nov 13, 2025
fe41c22
Fix an API change introduced in libfabric 1.20
biddisco Nov 13, 2025
ad198ec
Replace strcpy with strncpy
biddisco Nov 13, 2025
80f8108
Fix CI build fails due to unsupported older libfabric version
biddisco Nov 13, 2025
f246865
Use libfatbat mini libfabric repo for base libfabric layer
biddisco Apr 17, 2026
e002c84
Make sure logging is initialized
biddisco Apr 18, 2026
798a9ee
Remove duplicated functions from base controller
biddisco Apr 18, 2026
45f5b84
reuse controller base polling
biddisco Apr 18, 2026
fe8884c
Revert "Bump hwmalloc submodule"
biddisco Apr 20, 2026
764f439
Disable fortran temporarily for ci/cd
biddisco Apr 20, 2026
b6a05eb
Use macros for log creation to avoid instantiating void types when di…
biddisco Apr 20, 2026
bf65724
Use std::hardware_concurrency in place of boost::physical_concurrency
biddisco Apr 21, 2026
9b62a2c
Add fmt FetchContent to remove dependency and allow CI/CD to build co…
biddisco Apr 20, 2026
611b634
Cleanup dependencies
biddisco Apr 21, 2026
8f23d48
Switch libfabric install to newer source compiled version
biddisco Apr 21, 2026
47c6bcc
Enable libfabric testing on alps machines
biddisco Apr 21, 2026
76f5b6e
Add a var for PMI library to help with libfabric find
biddisco Apr 21, 2026
86b3ef1
Fix a debug format error and MPI_Comm type format for mpich/openmpi
biddisco Apr 17, 2026
b18c4f6
Enable CI/CD libfabric testing and set env var to disable MR Cache
biddisco Apr 22, 2026
de07829
experimenting with FI_MR_xxx for CI/CD
biddisco Apr 22, 2026
91ec19d
Apply a mem reg shutdown patch to the libfabric code for CI testing
biddisco Apr 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .cscs-ci/container/deps.Containerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,25 @@ RUN mkdir -p /opt/spack-packages && \
RUN spack repo remove --scope defaults:base builtin && \
spack repo add --scope site /opt/spack-packages/repos/spack_repo/builtin


RUN cat <<EOF > /opt/spack-packages/repos/spack_repo/builtin/packages/libfabric/mr_unsubscribe.patch
diff --git a/prov/util/src/import_mem_monitor.c b/prov/util/src/import_mem_monitor.c
index e7be581526f0..1f1f4a971099 100644
--- a/prov/util/src/import_mem_monitor.c
+++ b/prov/util/src/import_mem_monitor.c
@@ -111,7 +111,7 @@ static void ofi_import_monitor_unsubscribe(struct ofi_mem_monitor *notifier,
const void *addr, size_t len,
union ofi_mr_hmem_info *hmem_info)
{
- assert(impmon.impfid);
+ if (!impmon.impfid) return;
impmon.impfid->export_ops->unsubscribe(impmon.impfid, addr, len);
}
EOF

RUN sed -i '/patch("nvhpc-symver.patch", when="@1.6.0:1.14.0 %nvhpc")/a\ patch("mr_unsubscribe.patch")' /opt/spack-packages/repos/spack_repo/builtin/packages/libfabric/package.py


ARG SPACK_ENV_FILE
COPY $SPACK_ENV_FILE /spack_environment/spack.yaml

Expand Down
94 changes: 53 additions & 41 deletions .cscs-ci/default.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
- remote: "https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml"

variables:
BASE_IMAGE: jfrog.svc.cscs.ch/docker-group-csstaff/alps-images/ngc-pytorch:26.01-py3-alps4-dev
Expand All @@ -10,16 +10,22 @@ variables:
.build_deps_template:
timeout: 1 hour
before_script:
- echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true
- export DOCKERFILE_SHA=`sha256sum .cscs-ci/container/deps.Containerfile | head -c 16`
- echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME
--password-stdin || true
- export DOCKERFILE_SHA=`sha256sum .cscs-ci/container/deps.Containerfile |
head -c 16`
- export ENV_FILE_SHA=`sha256sum ${SPACK_ENV_FILE} | head -c 16`
- export CONFIG_TAG=`echo $DOCKERFILE_SHA-$BASE_IMAGE-$SPACK_SHA-$SPACK_PACKAGES_SHA-$ENV_FILE_SHA | sha256sum - | head -c 16`
- export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/oomph-spack-deps-$BACKEND:$CONFIG_TAG
- export CONFIG_TAG=`echo
$DOCKERFILE_SHA-$BASE_IMAGE-$SPACK_SHA-$SPACK_PACKAGES_SHA-$ENV_FILE_SHA |
sha256sum - | head -c 16`
- export
PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/oomph-spack-deps-$BACKEND:$CONFIG_TAG
- echo -e "CONFIG_TAG=$CONFIG_TAG" >> base-${BACKEND}.env
- echo -e "DEPS_IMAGE=$PERSIST_IMAGE_NAME" >> base-${BACKEND}.env
variables:
DOCKERFILE: .cscs-ci/container/deps.Containerfile
DOCKER_BUILD_ARGS: '["BASE_IMAGE", "SPACK_SHA", "SPACK_PACKAGES_SHA", "SPACK_ENV_FILE"]'
DOCKER_BUILD_ARGS: "[\"BASE_IMAGE\", \"SPACK_SHA\", \"SPACK_PACKAGES_SHA\",
\"SPACK_ENV_FILE\"]"
SPACK_ENV_FILE: .cscs-ci/spack/$BACKEND.yaml
artifacts:
reports:
Expand Down Expand Up @@ -47,24 +53,25 @@ build_deps_ucx:
- .container-builder-cscs-gh200
- .build_deps_template

# TODO: Libfabric tests are currently failing on Alps and need to be fixed.
# build_deps_libfabric:
# variables:
# BACKEND: libfabric
# extends:
# - .container-builder-cscs-gh200
# - .build_deps_template
build_deps_libfabric:
variables:
BACKEND: libfabric
extends:
- .container-builder-cscs-gh200
- .build_deps_template

.build_template:
extends: .container-builder-cscs-gh200
timeout: 15 minutes
before_script:
- echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin || true
- export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/oomph-build-$BACKEND:$CI_COMMIT_SHA
- echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME
--password-stdin || true
- export
PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/oomph-build-$BACKEND:$CI_COMMIT_SHA
- echo -e "BUILD_IMAGE=$PERSIST_IMAGE_NAME" >> build-${BACKEND}.env
variables:
DOCKERFILE: .cscs-ci/container/build.Containerfile
DOCKER_BUILD_ARGS: '["DEPS_IMAGE", "BACKEND"]'
DOCKER_BUILD_ARGS: "[\"DEPS_IMAGE\", \"BACKEND\"]"
artifacts:
reports:
dotenv: build-${BACKEND}.env
Expand Down Expand Up @@ -94,21 +101,20 @@ build_ucx:
- job: build_deps_ucx
artifacts: true

# TODO: Libfabric tests are currently failing on Alps and need to be fixed.
# build_libfabric:
# variables:
# BACKEND: libfabric
# extends: .build_template
# needs:
# - job: build_deps_libfabric
# artifacts: true
build_libfabric:
variables:
BACKEND: libfabric
extends: .build_template
needs:
- job: build_deps_libfabric
artifacts: true

.test_template_base:
extends: .container-runner-clariden-gh200
variables:
SLURM_JOB_NUM_NODES: 1
SLURM_GPUS_PER_TASK: 1
SLURM_TIMELIMIT: '5:00'
SLURM_TIMELIMIT: "5:00"
SLURM_PARTITION: normal
SLURM_MPI_TYPE: pmix
SLURM_NETWORK: disable_rdzv_get
Expand All @@ -123,7 +129,8 @@ build_ucx:
variables:
SLURM_NTASKS: 1
script:
- ctest --test-dir /oomph/build -L "serial" --output-on-failure --timeout 60 --parallel 8
- ctest --test-dir /oomph/build -L "serial" --output-on-failure --timeout 60
--parallel 8

.test_parallel_template:
extends: .test_template_base
Expand All @@ -132,9 +139,11 @@ build_ucx:
script:
# All ranks write to ctest files in Testing, but this can deadlock when
# writing inside the container.
- if [[ "${SLURM_PROCID}" == 0 ]]; then rm -rf /oomph/build/Testing; mkdir /tmp/Testing; ln -s /tmp/Testing /oomph/build/Testing; fi
- if [[ "${SLURM_PROCID}" == 0 ]]; then rm -rf /oomph/build/Testing; mkdir
/tmp/Testing; ln -s /tmp/Testing /oomph/build/Testing; fi
- sleep 1
- ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure --timeout 60
- ctest --test-dir /oomph/build -L "parallel-ranks-4" --output-on-failure
--timeout 60

# TODO: NCCL will be enabled in https://github.com/ghex-org/oomph/pull/55
# test_serial_nccl:
Expand Down Expand Up @@ -179,17 +188,20 @@ test_parallel_ucx:
artifacts: true
image: $BUILD_IMAGE

# TODO: Libfabric tests are currently failing on Alps and need to be fixed.
# test_serial_libfabric:
# extends: .test_serial_template
# needs:
# - job: build_libfabric
# artifacts: true
# image: $BUILD_IMAGE
test_serial_libfabric:
extends: .test_serial_template
variables:
FI_MR_CACHE_MAX_COUNT: 0 # Disables the MR cache to prevent the teardown segfault
needs:
- job: build_libfabric
artifacts: true
image: $BUILD_IMAGE

# test_parallel_libfabric:
# extends: .test_parallel_template
# needs:
# - job: build_libfabric
# artifacts: true
# image: $BUILD_IMAGE
test_parallel_libfabric:
extends: .test_parallel_template
# variables:
# FI_MR_CACHE_MAX_COUNT: 0 # Disables the MR cache to prevent the teardown segfault
needs:
- job: build_libfabric
artifacts: true
image: $BUILD_IMAGE
13 changes: 13 additions & 0 deletions .cscs-ci/spack/mr_unsubscribe.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
diff --git a/prov/util/src/import_mem_monitor.c b/prov/util/src/import_mem_monitor.c
index e7be581526f0..1f1f4a971099 100644
--- a/prov/util/src/import_mem_monitor.c
+++ b/prov/util/src/import_mem_monitor.c
@@ -111,7 +111,7 @@ static void ofi_import_monitor_unsubscribe(struct ofi_mem_monitor *notifier,
const void *addr, size_t len,
union ofi_mr_hmem_info *hmem_info)
{
- assert(impmon.impfid);
+ if (!impmon.impfid) return;
impmon.impfid->export_ops->unsubscribe(impmon.impfid, addr, len);
}

11 changes: 9 additions & 2 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@ jobs:
- name: install libfabric
run: |
apt update
apt-get -y install libfabric-dev
git clone https://github.com/ofiwg/libfabric
cd libfabric
./autogen.sh
./configure --enable-debug --enable-tcp=yes --enable-lnx=no --enable-rxm=yes --enable-shm=yes --enable-cxi=no --enable-efa=no --enable-mrail=no --enable-opx=no --enable-psm2=no --enable-sm2=no --enable-psm3=no --enable-rxd=no --enable-sockets=no --enable-ucx=no --enable-udp=no --enable-usnic=no --enable-verbs=no --enable-xpmem=no
make -j install
# apt-get -y install libfabric-dev
- uses: actions/checkout@v3
with:
submodules: recursive
Expand Down Expand Up @@ -48,7 +53,9 @@ jobs:
- name: Build
run: cmake --build build --parallel 2
- name: Execute tests
run: cd build && export SHLVL=1 && export OMPI_ALLOW_RUN_AS_ROOT=1 && export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 && export CTEST_OUTPUT_ON_FAILURE=1 && env && ctest
run: cd build && export SHLVL=1 && export OMPI_ALLOW_RUN_AS_ROOT=1 && export
OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 && export CTEST_OUTPUT_ON_FAILURE=1
&& env && ctest

build-gpu-hip:
runs-on: ubuntu-latest
Expand Down
13 changes: 4 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
cmake_minimum_required(VERSION 3.17)
# CMake version is set at 3.17 because of find_package(CUDAToolkit)

if (NOT ${CMAKE_VERSION} VERSION_LESS 3.27)
# new in 3.27: additionally use uppercase <PACKAGENAME>_ROOT
# environment and CMake variables for find_package
cmake_policy(SET CMP0144 NEW)
endif()

set(OOMPH_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
list(APPEND CMAKE_MODULE_PATH "${OOMPH_MODULE_PATH}")

Expand All @@ -28,6 +22,7 @@ endfunction()

set_policy(CMP0074 NEW) # find_package uses XXX_ROOT vars using PackageName
set_policy(CMP0144 NEW) # find_package allows XXX_ROOT vars using PACKAGENAME Uppercase
set_policy(CMP0167 NEW) # find_package uses new boost config (boost 1.70 onwards)

# ---------------------------------------------------------------------
# CMake setup, C++ version, build type, modules, etc
Expand Down Expand Up @@ -92,14 +87,14 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/config.hpp.in
${CMAKE_CURRENT_BINARY_DIR}/include/oomph/config.hpp @ONLY)
install(FILES ${PROJECT_BINARY_DIR}/include/oomph/config.hpp
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/oomph)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_config.inc.in
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_config.inc.in
${CMAKE_CURRENT_BINARY_DIR}/include/oomph/cmake_config.inc)

# ---------------------------------------------------------------------
# fortran bindings
# ---------------------------------------------------------------------
include(oomph_fortran)
add_subdirectory(bindings)
#include(oomph_fortran)
#add_subdirectory(bindings)

# ---------------------------------------------------------------------
# testing
Expand Down
31 changes: 16 additions & 15 deletions cmake/FindPMIx.cmake
Original file line number Diff line number Diff line change
@@ -1,30 +1,31 @@
find_package(PkgConfig QUIET)
pkg_check_modules(PC_PMIX QUIET pmix)

find_path(PMIX_INCLUDE_DIR pmix.h
HINTS
${PMIX_ROOT} ENV PMIX_ROOT
${PMIX_DIR} ENV PMIX_DIR
PATH_SUFFIXES include)
find_path(PMIX_INCLUDE_DIR pmix.h HINTS ${PMIX_ROOT} ENV PMIX_ROOT ${PMIX_DIR}
ENV PMIX_DIR PATH_SUFFIXES include
)

find_library(PMIX_LIBRARY HINT ${PMIX_DIR} NAMES pmix
HINTS
${PMIX_ROOT} ENV PMIX_ROOT
${PMIX_DIR} ENV PMIX_DIR
PATH_SUFFIXES lib lib64)
find_library(
PMIX_LIBRARY HINT ${PMIX_DIR} NAMES pmix HINTS ${PMIX_ROOT} ENV PMIX_ROOT
${PMIX_DIR} ENV PMIX_DIR
PATH_SUFFIXES lib lib64
)

set(PMIX_LIBRARIES ${PMIX_LIBRARY} CACHE INTERNAL "")
set(PMIX_LIBRARIES ${PMIX_LIBRARY} CACHE INTERNAL "")
set(PMIX_INCLUDE_DIRS ${PMIX_INCLUDE_DIR} CACHE INTERNAL "")

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(PMIx DEFAULT_MSG PMIX_LIBRARY PMIX_INCLUDE_DIR)
find_package_handle_standard_args(
PMIx DEFAULT_MSG PMIX_LIBRARY PMIX_INCLUDE_DIR
)

mark_as_advanced(PMIX_ROOT PMIX_LIBRARY PMIX_INCLUDE_DIR)

if(NOT TARGET PMIx::libpmix AND PMIx_FOUND)
add_library(PMIx::libpmix SHARED IMPORTED)
set_target_properties(PMIx::libpmix PROPERTIES
IMPORTED_LOCATION ${PMIX_LIBRARY}
INTERFACE_INCLUDE_DIRECTORIES ${PMIX_INCLUDE_DIR}
set_target_properties(
PMIx::libpmix PROPERTIES IMPORTED_LOCATION ${PMIX_LIBRARY}
INTERFACE_INCLUDE_DIRECTORIES ${PMIX_INCLUDE_DIR}
)
set(PMI_LIBRARY_TARGET PMIx::libpmix)
endif()
3 changes: 3 additions & 0 deletions cmake/config.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,12 @@

#cmakedefine01 OOMPH_USE_FAST_PIMPL
#cmakedefine01 OOMPH_ENABLE_BARRIER

// clang-format off
#define OOMPH_RECURSION_DEPTH @OOMPH_RECURSION_DEPTH@

#define OOMPH_VERSION @OOMPH_VERSION_NUMERIC@
#define OOMPH_VERSION_MAJOR @OOMPH_VERSION_MAJOR@
#define OOMPH_VERSION_MINOR @OOMPH_VERSION_MINOR@
#define OOMPH_VERSION_PATCH @OOMPH_VERSION_PATCH@
// clang-format on
2 changes: 2 additions & 0 deletions cmake/oomph_defs.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ namespace oomph
{
namespace fort
{
// clang-format off
using fp_type = @OOMPH_FORTRAN_FP@;
// clang-format on
typedef enum {
OomphBarrierGlobal=1,
OomphBarrierThread=2,
Expand Down
Loading
Loading