From f6b9ca7dfe088db3c1531b3364db7fbc4e3bc4c2 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Thu, 9 Apr 2026 08:00:31 +0200 Subject: [PATCH 01/10] SRE-3704 ci: Fault injection testing stage on VM/bare metal unitTestPost() already processes nlt-junit.xml via the testResults parameter it receives. The bare 'junit testResults: nlt-junit.xml' call that follows is redundant and has no failure protection: it uses the default healthScaleFactor so when fault injection tests intentionally produce failures in nlt-junit.xml it marks the build FAILURE immediately, overriding the controlled result handling done by unitTestPost(). When node_local_test.py runs with --no-root, DAOS logs are written to /localhome/jenkins/build/nlt_logs/ instead of /tmp/. The existing rsync only fetches from /tmp/, leaving nlt_logs/ empty and causing: No artifacts found that match the file pattern "nlt_logs/". Configuration error? Add a second rsync from build/nlt_logs/ to collect logs from the --no-root code path. The '|| true' ensures non-fatal behavior when the path does not exist (plain NLT runs without --no-root). Jenkinsfile: simplify NLT fault injection recordIssues call The vm_test/nlt-errors.json issue scanning for the 'NLT Fault injection testing' stage is now handled by unitTestPost() in pipeline-lib, so remove it from the explicit recordIssues call here. fault_status fallback only based on PATH - Add fallback `fault_status` detection: if the primary detection via `$PREFIX/bin` fails, try resolving `fault_status` via `$PATH`, improving robustness when the binary is installed via RPM rather than built in-tree.
Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true nlt: remove ABT_STACK_OVERFLOW_CHECK=mprotect from nlt_server.yaml mprotect-based Argobots ULT stack overflow checking causes a TLB shootdown IPI on every stack allocation/deallocation. On KVM hosts running multiple VMs in parallel this results in VM exits across all vCPUs, significantly increasing latency under concurrent load. Remove the setting to use the default (no overflow check), which is acceptable for a CI/test environment where crashes are already caught by the test harness. Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true ci: explicitly pass NLT/FI parameters to unitTest and unitTestPost pipeline-lib now supports overriding NLT/FI defaults (always_script, testResults, valgrind_pattern, with_valgrind, NLT, FI) via the config map, taking priority over the values auto-detected from the stage name by parseStageInfo. Make the Jenkinsfile stages explicit to take advantage of this and to make the stage configuration self-documenting. 
NLT stage (unitTest call): - Add with_valgrind: 'memcheck', valgrind_pattern: '*memcheck.xml', always_script: 'ci/unit/test_nlt_post.sh', testResults: 'nlt-junit.xml' NLT stage (unitTestPost call): - Remove always_script (now passed to unitTest above) - Add NLT: true to explicitly activate the NLT post-processing block (recordIssues, discoverGitReferenceBuild) instead of relying on stage name detection - Add valgrind_pattern: '*memcheck.xml' for the valgrind_stash NLT Fault injection testing stage (unitTest call): - Add always_script: 'ci/unit/test_nlt_post.sh', testResults: 'nlt-junit.xml' - Add with_valgrind: '' to explicitly suppress valgrind for FI NLT Fault injection testing stage (unitTestPost call): - Replace always_script with FI: true to explicitly activate fault injection post-processing (nlt-client-leaks.json, 'Fault injection' naming, discoverGitReferenceBuild) instead of relying on the now- removed stage name auto-detection of FI in parseStageInfo Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- Jenkinsfile | 134 ++++++++-------------- ci/docker_nlt.sh | 42 ------- ci/unit/test_nlt.sh | 7 +- ci/unit/test_nlt_node.sh | 12 +- ci/unit/test_nlt_post.sh | 8 +- src/tests/ftest/cart/util/cart_logtest.py | 5 - utils/nlt_server.yaml | 1 - utils/node_local_test.py | 15 +-- 8 files changed, 73 insertions(+), 151 deletions(-) delete mode 100755 ci/docker_nlt.sh diff --git a/Jenkinsfile b/Jenkinsfile index 7af1eac49be..5ce89f832d1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -39,42 +39,6 @@ void job_step_update(def value=currentBuild.currentResult) { jobStatusUpdate(job_status_internal, env.STAGE_NAME, value) } -Map nlt_test() { - // groovylint-disable-next-line NoJavaUtilDate - Date 
startDate = new Date() - try { - unstash('nltr') - } catch (e) { - print 'Unstash failed, results from NLT stage will not be included' - } - sh label: 'Fault injection testing using NLT', - script: './ci/docker_nlt.sh --class-name el8.fault-injection fi' - List filesList = [] - filesList.addAll(findFiles(glob: '*.memcheck.xml')) - int vgfail = 0 - int vgerr = 0 - if (filesList) { - String rcs = sh label: 'Check for Valgrind errors', - script: "grep -E ')' ${filesList.join(' ')} || true", - returnStdout: true - if (rcs) { - vfail = 1 - } - String suite = sanitizedStageName() - junitSimpleReport suite: suite, - file: suite + '_valgrind_results.xml', - fails: vgfail, - errors: vgerr, - name: 'Valgrind_Memcheck', - class: 'Valgrind', - message: 'Valgrind Memcheck error detected', - testdata: rcs - } - int runTime = durationSeconds(startDate) - Map runData = ['nlttest_time': runTime] - return runData -} - // Don't define this as a type or it loses it's global scope target_branch = env.CHANGE_TARGET ? 
env.CHANGE_TARGET : env.BRANCH_NAME String sanitized_JOB_NAME() { @@ -345,8 +309,11 @@ pipeline { defaultValue: 'ci_vm9', description: 'Label to use for 9 VM functional tests') string(name: 'CI_NLT_1_LABEL', - defaultValue: 'ci_nlt_1', + defaultValue: 'ci_nlt_vm1', description: 'Label to use for NLT tests') + string(name: 'CI_FI_1_LABEL', + defaultValue: 'ci_fi_vm1', + description: 'Label to use for Fault Injection (FI) tests') string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_LABEL', defaultValue: 'ci_nvme5', description: 'Label to use for the Functional Hardware Medium (MD on SSD) stages') @@ -798,7 +765,7 @@ pipeline { } } } - stage('NLT on EL 8.8') { + stage('NLT') { when { beforeAgent true expression { params.CI_NLT_TEST && !skipStage() } @@ -809,11 +776,23 @@ pipeline { steps { job_step_update( unitTest(timeout_time: 60, - inst_repos: prRepos(), - test_script: 'ci/unit/test_nlt.sh', + inst_repos: daosRepos(), + test_script: 'ci/unit/test_nlt.sh' + + ' --system-ram-reserved 4' + + ' --max-log-size 1950MiB' + + ' --dfuse-dir /localhome/jenkins/' + + ' --log-usage-save nltir.xml' + + ' --log-usage-export nltr.json' + + ' --class-name nlt all', + with_valgrind: 'memcheck', + valgrind_pattern: '*memcheck.xml', + always_script: 'ci/unit/test_nlt_post.sh', + testResults: 'nlt-junit.xml', unstash_opt: true, unstash_tests: false, - inst_rpms: unitPackages())) + inst_rpms: unitPackages(target: 'el9'), + image_version: 'el9.7', + prov_env_vars: 'VM_CPUS=14')) // recordCoverage(tools: [[parser: 'COBERTURA', pattern:'nltir.xml']], // skipPublishingChecks: true, // id: 'tlc', name: 'Fault Injection Interim Report') @@ -823,9 +802,10 @@ pipeline { always { unitTestPost artifacts: ['nlt_logs/'], testResults: 'nlt-junit.xml', - always_script: 'ci/unit/test_nlt_post.sh', referenceJobName: 'daos-stack/daos/release%252F2.6', - valgrind_stash: 'el8-gcc-nlt-memcheck' + valgrind_stash: 'nlt-memcheck', + valgrind_pattern: '*memcheck.xml', + NLT: true recordIssues enabledForFailure: true, 
failOnError: false, ignoreQualityGate: true, @@ -1010,62 +990,47 @@ pipeline { } } // post } // stage('Functional on Ubuntu 20.04') - stage('Fault injection testing on EL 8.8') { + stage('Fault injection testing') { when { beforeAgent true expression { !skipStage() } } agent { - dockerfile { - filename 'utils/docker/Dockerfile.el.8' - label 'docker_runner_fi' - additionalBuildArgs dockerBuildArgs(repo_type: 'stable', - parallel_build: true, - deps_build: true) - args '--tmpfs /mnt/daos_0' - } + label params.CI_FI_1_LABEL } steps { job_step_update( - sconsBuild(parallel_build: true, - scons_args: 'PREFIX=/opt/daos TARGET_TYPE=release BUILD_TYPE=debug', - build_deps: 'no')) - job_step_update(nlt_test()) - // recordCoverage(tools: [[parser: 'COBERTURA', pattern:'nltr.xml']], - // skipPublishingChecks: true, - // id: 'fir', name: 'Fault Injection Report') + unitTest(timeout_time: 240, + inst_repos: daosRepos(), + test_script: 'ci/unit/test_nlt.sh --memcheck no' + + ' --system-ram-reserved 4 --server-debug WARN' + + ' --log-usage-import nltr.json' + + ' --log-usage-save nltr.xml' + + ' --class-name fault-injection fi', + with_valgrind: '', + always_script: 'ci/unit/test_nlt_post.sh', + testResults: 'nlt-junit.xml', + unstash_opt: true, + unstash_tests: false, + inst_rpms: unitPackages(target: 'el9') + ' daos-client-tests', + image_version: 'el9.7', + prov_env_vars: 'VM_CPUS=14')) } post { always { - discoverGitReferenceBuild referenceJob: 'daos-stack/daos/release%252F2.6', + unitTestPost artifacts: ['nlt_logs/'], + testResults: 'nlt-junit.xml', + with_valgrind: '', + FI: true + discoverGitReferenceBuild referenceJob: 'daos-stack/daos/master', scm: 'daos-stack/daos', requiredResult: hudson.model.Result.UNSTABLE - recordIssues enabledForFailure: true, - /* ignore warning/errors from PMDK logging system */ - filters: [excludeFile('pmdk/.+')], - failOnError: false, - ignoreQualityGate: true, - qualityGates: [[threshold: 1, type: 'TOTAL_ERROR'], - [threshold: 1, type: 
'TOTAL_HIGH'], - [threshold: 1, type: 'NEW_NORMAL', unstable: true], - [threshold: 1, type: 'NEW_LOW', unstable: true]], - tools: [issues(pattern: 'nlt-errors.json', - name: 'Fault injection issues', - id: 'Fault_Injection'), - issues(pattern: 'nlt-client-leaks.json', - name: 'Fault injection leaks', - id: 'NLT_client')], - scm: 'daos-stack/daos' - junit testResults: 'nlt-junit.xml' - stash name: 'fault-inject-valgrind', - includes: '*.memcheck.xml', - allowEmpty: true - archiveArtifacts artifacts: 'nlt_logs/el8.fault-injection/', + archiveArtifacts artifacts: 'nlt_logs/fault-injection/', allowEmptyArchive: true job_status_update() } } - } // stage('Fault injection testing on EL 8.8') + } // stage('Fault injection testing') stage('Test RPMs on EL 8.6') { when { beforeAgent true @@ -1277,9 +1242,8 @@ pipeline { } // stages post { always { - valgrindReportPublish valgrind_stashes: ['el8-gcc-nlt-memcheck', - 'el8-gcc-unit-memcheck', - 'fault-inject-valgrind'] + valgrindReportPublish valgrind_stashes: ['nlt-memcheck', + 'el8-gcc-unit-memcheck'] job_status_update('final_status') jobStatusWrite(job_status_internal) } diff --git a/ci/docker_nlt.sh b/ci/docker_nlt.sh deleted file mode 100755 index a6d85eba771..00000000000 --- a/ci/docker_nlt.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -# Script for running NLT in a docker container. This is called from Jenkinsfile -# where needed, and is a cheat way of running setup_daos_server_helper under sudo -# and NLT itself from a single script. - -set -e - -set -x - -. 
utils/sl/setup_local.sh - -ps auwx -sudo --preserve-env=SL_PREFIX,SL_SPDK_PREFIX ./utils/setup_daos_server_helper.sh - -TMP_DIR=$(mktemp -d) - -cp utils/node_local_test.py utils/nlt_server.yaml .build_vars.json "$TMP_DIR" -cp src/tests/ftest/cart/util/cart_logparse.py src/tests/ftest/cart/util/cart_logtest.py "$TMP_DIR" -if [ -e nltr.json ] -then - cp nltr.json "$TMP_DIR" -fi - -pushd "$TMP_DIR" - -set +e - -sudo --preserve-env=VIRTUAL_ENV,PATH ./node_local_test.py \ - --no-root --memcheck no --system-ram-reserved 48 --server-debug WARN \ - --log-usage-import nltr.json --log-usage-save nltr.xml "$@" - -RC=$? -set -e -popd - -cp "$TMP_DIR"/*.json . -cp "$TMP_DIR"/*.xml . -sudo chmod -R o+r "$TMP_DIR"/nlt_logs -cp -r "$TMP_DIR"/nlt_logs . - -exit $RC diff --git a/ci/unit/test_nlt.sh b/ci/unit/test_nlt.sh index a5f50545a15..23e3bc8b549 100755 --- a/ci/unit/test_nlt.sh +++ b/ci/unit/test_nlt.sh @@ -13,6 +13,7 @@ mydir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" # Copy over the install tree and some of the build tree. 
rsync -rlpt -z -e "ssh $SSH_KEY_ARGS" .build_vars* opt-daos.tar utils requirements-utest.txt jenkins@"$NODE":build/ -# shellcheck disable=SC2029 -ssh -tt "$SSH_KEY_ARGS" jenkins@"$NODE" "DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \ - $(cat "$mydir/test_nlt_node.sh")" +ssh -T "$SSH_KEY_ARGS" jenkins@"$NODE" \ + "DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \ + DAOS_NO_PROXY=\"${DAOS_NO_PROXY:-}\" \ + bash -s -- $*" < "$mydir/test_nlt_node.sh" diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index aa6bcfe86b4..58a6159c65b 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -6,7 +6,6 @@ set -uex sudo bash -c 'echo 1 > /proc/sys/kernel/sysrq' -sudo mkdir -p /mnt/daos # using mmap()'ed ULT stacks requires to bump system default if [ "$(sudo sysctl -n vm.max_map_count)" -lt "1000000" ] ; then sudo sysctl vm.max_map_count=1000000 @@ -40,5 +39,12 @@ pip install /opt/daos/lib/daos/python/ # set high open file limit in the shell to avoid extra warning sudo prlimit --nofile=1024:262144 --pid $$ prlimit -n -HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" ./utils/node_local_test.py --max-log-size 1700MiB \ - --dfuse-dir /localhome/jenkins/ --log-usage-save nltir.xml --log-usage-export nltr.json all + +mkdir -p nlt_logs +sudo mount -t tmpfs tmpfs nlt_logs +sudo chown jenkins:jenkins nlt_logs + +TMPDIR="$(pwd)/nlt_logs" \ + HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \ + NO_PROXY="${DAOS_NO_PROXY:-}" \ + exec ./utils/node_local_test.py "$@" diff --git a/ci/unit/test_nlt_post.sh b/ci/unit/test_nlt_post.sh index c46a63dac2f..db39d9c7c3d 100755 --- a/ci/unit/test_nlt_post.sh +++ b/ci/unit/test_nlt_post.sh @@ -13,14 +13,18 @@ mkdir nlt_logs # Copy any log files. Use rsync filters here to allow us to specify # all files we want to copy, as it's much more flexible than using # standard wildcards. -rsync -v -dprt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":/tmp/ \ + +# Assuming that node_local_test.py is run with --class-name, +# the logs will be in build/nlt_logs/ on the node. 
+rsync -v -rlpt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/nlt_logs/ \ --filter="include dnt*.log" --filter="include dnt*.log.bz2" \ - --filter="include dnt_fi_*_logs" \ + --filter="include dnt_fi_*_logs" --filter="include */" \ --filter="exclude *" nlt_logs/ rsync -v -dpt -z -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/ \ --filter="include nlt*.json" --filter="include dnt*.xml" \ --filter="include nltir.xml" --filter="include nltr.json" \ --filter="include nlt-junit.xml" --filter="exclude *" ./ + mkdir -p vm_test mv nlt-errors.json vm_test/ diff --git a/src/tests/ftest/cart/util/cart_logtest.py b/src/tests/ftest/cart/util/cart_logtest.py index e6149f28d32..e118b0e5758 100755 --- a/src/tests/ftest/cart/util/cart_logtest.py +++ b/src/tests/ftest/cart/util/cart_logtest.py @@ -226,7 +226,6 @@ def __init__(self, log_iter, quiet=False): self.fi_triggered = False self.fi_location = None self.skip_suffixes = [] - self.skip_substrings = [] self._tracers = [] self.ftest_mode = False @@ -445,10 +444,6 @@ def _check_pid_from_log_file(self, pid, abort_on_warning, leak_wf, show_memleaks show = False if show and any(map(line.get_msg().endswith, self.skip_suffixes)): show = False - if show: - line_msg = line.get_msg().casefold() - if any(sub in line_msg for sub in self.skip_substrings): - show = False if show: # Allow WARNING or ERROR messages, but anything higher like assert should # trigger a failure. 
diff --git a/utils/nlt_server.yaml b/utils/nlt_server.yaml index d30dd9721bf..1473d8773d9 100644 --- a/utils/nlt_server.yaml +++ b/utils/nlt_server.yaml @@ -14,7 +14,6 @@ engines: - DAOS_MD_CAP=1024 - DAOS_STRICT_SHUTDOWN=1 - DAOS_TARGET_OVERSUBSCRIBE=1 - - ABT_STACK_OVERFLOW_CHECK=mprotect storage: - class: ram diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 7d62133ffd3..b02db128323 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -4897,13 +4897,6 @@ def sizeof_fmt(num, suffix='B'): if ignore_busy: lto.skip_suffixes.append(" DER_BUSY(-1012): 'Device or resource busy'") - lto.skip_substrings.extend([ - 'sluggish ec boundary report from rank', - 'sluggish stable epoch reporting', - 'progress callback was not called for too long', - 'rpc failed; rc:', - ]) - try: lto.check_log_file(abort_on_warning=True, show_memleaks=show_memleaks, @@ -5792,7 +5785,7 @@ def _prep(self): # pylint: disable-next=no-member num_cores = len(os.sched_getaffinity(0)) - if num_cores < 20: + if num_cores < 14: max_child = 1 else: max_child = int(num_cores / 4 * 3) @@ -6492,12 +6485,14 @@ def run(wf, args): run_fi = False if args.perf_check or fi_test or fi_test_dfuse: - fs = subprocess.run([os.path.join(conf['PREFIX'], 'bin', 'fault_status')], check=False) + fi_env = os.environ.copy() + fi_env['PATH'] = f'{conf["PREFIX"]}/bin:{fi_env["PATH"]}' + fs = subprocess.run(['fault_status'], check=False, env=fi_env) print(fs) if fs.returncode == 0: run_fi = True else: - print("Unable to detect fault injection feature, skipping testing") + print("Unable to detect fault injection feature - skipping FI testing") if run_fi: args.server_debug = 'INFO' From 95c82c73f6048a210521d8dcb83e86aacf14d75a Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Mon, 1 Jun 2026 12:38:36 +0200 Subject: [PATCH 02/10] Test with original 2.6 nlt settings Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true 
Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: false Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- Jenkinsfile | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 5ce89f832d1..7d318f59f88 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -778,8 +778,7 @@ pipeline { unitTest(timeout_time: 60, inst_repos: daosRepos(), test_script: 'ci/unit/test_nlt.sh' + - ' --system-ram-reserved 4' + - ' --max-log-size 1950MiB' + + ' --max-log-size 1700MiB' + ' --dfuse-dir /localhome/jenkins/' + ' --log-usage-save nltir.xml' + ' --log-usage-export nltr.json' + @@ -790,8 +789,8 @@ pipeline { testResults: 'nlt-junit.xml', unstash_opt: true, unstash_tests: false, - inst_rpms: unitPackages(target: 'el9'), - image_version: 'el9.7', + inst_rpms: unitPackages(target: 'el8'), + image_version: 'el8.8', prov_env_vars: 'VM_CPUS=14')) // recordCoverage(tools: [[parser: 'COBERTURA', pattern:'nltir.xml']], // skipPublishingChecks: true, From 18e4d326e64060ddeb9be06d2a57c1fa3b4f9279 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Wed, 3 Jun 2026 15:16:17 +0200 Subject: [PATCH 03/10] Test with serialized NLT Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true Skip-func-test-el8: true Signed-off-by: Tomasz Gromadzki --- utils/node_local_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/node_local_test.py b/utils/node_local_test.py index b02db128323..c82cb269072 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -5793,6 +5793,8 @@ def _prep(self): if self.single_process: max_child = 1 + max_child = 
1 + print(f'Maximum number of spawned tests will be {max_child}') active = [] From 03997e285bfb7b03478d4b915af8fd230a584224 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Wed, 3 Jun 2026 20:37:23 +0200 Subject: [PATCH 04/10] Test with main test folder not in TMPFS additionally increase log size Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true Skip-func-test-el8: true Signed-off-by: Tomasz Gromadzki --- Jenkinsfile | 2 +- ci/unit/test_nlt_node.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 7d318f59f88..58b7b8f4d7a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -778,7 +778,7 @@ pipeline { unitTest(timeout_time: 60, inst_repos: daosRepos(), test_script: 'ci/unit/test_nlt.sh' + - ' --max-log-size 1700MiB' + + ' --max-log-size 1900MiB' + ' --dfuse-dir /localhome/jenkins/' + ' --log-usage-save nltir.xml' + ' --log-usage-export nltr.json' + diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index 58a6159c65b..f9b45932da4 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -41,8 +41,8 @@ sudo prlimit --nofile=1024:262144 --pid $$ prlimit -n mkdir -p nlt_logs -sudo mount -t tmpfs tmpfs nlt_logs -sudo chown jenkins:jenkins nlt_logs +#sudo mount -t tmpfs tmpfs nlt_logs +#sudo chown jenkins:jenkins nlt_logs TMPDIR="$(pwd)/nlt_logs" \ HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \ From bf89b57091bdb3f2aa7a2592375ec8ee448f7916 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Wed, 3 Jun 2026 23:31:41 +0200 Subject: [PATCH 05/10] Avoid tmpfs for any file operation Use dfuse_dir rather than tempfile default to avoid landing on a tmpfs (e.g.
nlt_logs) which does not support user xattrs on older kernels (RHEL 8 / kernel < 5.15), causing duns_create_path() to fail with DER_NOTSUPPORTED. Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true Signed-off-by: Tomasz Gromadzki --- ci/unit/test_nlt_node.sh | 4 ++-- utils/node_local_test.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index f9b45932da4..58a6159c65b 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -41,8 +41,8 @@ sudo prlimit --nofile=1024:262144 --pid $$ prlimit -n mkdir -p nlt_logs -#sudo mount -t tmpfs tmpfs nlt_logs -#sudo chown jenkins:jenkins nlt_logs +sudo mount -t tmpfs tmpfs nlt_logs +sudo chown jenkins:jenkins nlt_logs TMPDIR="$(pwd)/nlt_logs" \ HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \ diff --git a/utils/node_local_test.py b/utils/node_local_test.py index c82cb269072..e3379ce64bf 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -3651,7 +3651,11 @@ def test_chmod_ro(self): def test_with_path(self): """Test that dfuse starts with path option.""" - tmp_dir = tempfile.mkdtemp() + # Use dfuse_dir rather than tempfile default to avoid landing on a tmpfs + # (e.g. nlt_logs) which does not support user xattrs on older kernels + # (RHEL 8 / kernel < 5.15), causing duns_create_path() to fail with + # DER_NOTSUPPORTED. 
+ tmp_dir = tempfile.mkdtemp(dir=self.conf.args.dfuse_dir) cont_path = join(tmp_dir, 'my-cont') create_cont(self.conf, self.pool, path=cont_path) From cf69b667d4222668c6aa2309187bf9de462f1281 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Mon, 8 Jun 2026 10:28:51 +0200 Subject: [PATCH 06/10] Fix linting issue Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-func-test-el9: true Skip-func-test-leap15: true Skip-test-el-8-rpms: true Skip-func-hw-test: true Skip-func-test-el8: true --- utils/cq/words.dict | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/cq/words.dict b/utils/cq/words.dict index 73c1a3d4858..19595dc6eab 100644 --- a/utils/cq/words.dict +++ b/utils/cq/words.dict @@ -461,6 +461,7 @@ timestamp timestamps tmp tmpfs +tempfile toolchain toplevel traceback From 045b9f71d03ed739b21b5db01abac0aa6e7d1c96 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Mon, 8 Jun 2026 14:07:39 +0200 Subject: [PATCH 07/10] Try to pin mercury to the last lp155 version Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-func-test-el9: true Skip-func-test-leap15: true Skip-test-el-8-rpms: true Skip-func-hw-test: true Skip-func-test-el8: true --- utils/rpms/daos.spec | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index d52fa5ba75e..75ee9e1b0cb 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -10,7 +10,11 @@ %else %global daos_build_args client test %endif -%global mercury_version 2.4.1 +%if (0%{?suse_version} >= 1500) +%global mercury_version 2.4.1-2.suse.lp155 +%else +%global mercury_version 2.4.1-2 +%endif %global libfabric_version 1.20 
%global argobots_version 1.2-3 %global __python %{__python3} From 3854bbebf3c15b7ab7e876c7196da819e389e241 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Mon, 8 Jun 2026 15:04:39 +0200 Subject: [PATCH 08/10] Try to pin mercury to the last lp155 version 2nd Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-func-test-el9: true Skip-func-test-leap15: true Skip-test-el-8-rpms: true Skip-func-hw-test: true Skip-func-test-el8: true --- utils/rpms/daos.spec | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index 75ee9e1b0cb..a8ae2b62c03 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -10,11 +10,8 @@ %else %global daos_build_args client test %endif -%if (0%{?suse_version} >= 1500) -%global mercury_version 2.4.1-2.suse.lp155 -%else %global mercury_version 2.4.1-2 -%endif +%global mercury_version_next 2.4.1-3 %global libfabric_version 1.20 %global argobots_version 1.2-3 %global __python %{__python3} @@ -41,6 +38,7 @@ BuildRequires: scons >= 2.4 %endif BuildRequires: libfabric-devel >= %{libfabric_version} BuildRequires: mercury-devel >= %{mercury_version} +BuildRequires: mercury-devel < %{mercury_version_next} BuildRequires: gcc-c++ %if (0%{?rhel} >= 8) %global openmpi openmpi @@ -137,6 +135,7 @@ Requires: openssl # of mercury, at which time the autoprov shared library version should # suffice Requires: mercury-libfabric >= %{mercury_version} +Requires: mercury-libfabric < %{mercury_version_next} %description @@ -168,6 +167,7 @@ Requires: libpmemobj >= 2.1.3-2 %endif Requires: libfabric >= %{libfabric_version} Requires: mercury-libfabric >= %{mercury_version} +Requires: mercury-libfabric < %{mercury_version_next} Requires(post): /sbin/ldconfig Requires(postun): /sbin/ldconfig Requires: numactl @@ -189,6 +189,7 
@@ This package contains DAOS administrative tools (e.g. dmg). Summary: The DAOS client Requires: %{name}%{?_isa} = %{version}-%{release} Requires: mercury-libfabric >= %{mercury_version} +Requires: mercury-libfabric < %{mercury_version_next} Requires: libfabric >= %{libfabric_version} %if (0%{?suse_version} >= 1500) Requires: libfabric1 >= %{libfabric_version} From f402f18cb8fa550de10147c0e2114b4217642f1a Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Tue, 9 Jun 2026 17:28:45 +0200 Subject: [PATCH 09/10] Fix: proper reference job for FI tests Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-func-test-el9: true Skip-func-test-leap15: true Skip-test-el-8-rpms: true Skip-func-hw-test: true Skip-func-test-el8: true --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 58b7b8f4d7a..7a081c1e545 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1021,7 +1021,7 @@ pipeline { testResults: 'nlt-junit.xml', with_valgrind: '', FI: true - discoverGitReferenceBuild referenceJob: 'daos-stack/daos/master', + discoverGitReferenceBuild referenceJob: 'daos-stack/daos/release%252F2.6', scm: 'daos-stack/daos', requiredResult: hudson.model.Result.UNSTABLE archiveArtifacts artifacts: 'nlt_logs/fault-injection/', From f42638afd317029fe83e177234f19af24d2fad65 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Tue, 9 Jun 2026 22:22:55 +0200 Subject: [PATCH 10/10] Fix: proper reference job for FI tests 2nd Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-func-test-el9: true Skip-func-test-leap15: true Skip-test-el-8-rpms: true Skip-func-hw-test: true Skip-func-test-el8: true 
--- Jenkinsfile | 6 ++---- utils/rpms/daos.spec | 3 --- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 7a081c1e545..36ae932e4e3 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1020,10 +1020,8 @@ pipeline { unitTestPost artifacts: ['nlt_logs/'], testResults: 'nlt-junit.xml', with_valgrind: '', - FI: true - discoverGitReferenceBuild referenceJob: 'daos-stack/daos/release%252F2.6', - scm: 'daos-stack/daos', - requiredResult: hudson.model.Result.UNSTABLE + FI: true, + referenceJobName: 'daos-stack/daos/release%252F2.6' archiveArtifacts artifacts: 'nlt_logs/fault-injection/', allowEmptyArchive: true job_status_update() diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index 5ce8c121a9a..f35904cb6eb 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -135,7 +135,6 @@ Requires: openssl # of mercury, at which time the autoprov shared library version should # suffice Requires: mercury-libfabric >= %{mercury_version} -Requires: mercury-libfabric < %{mercury_version_next} @@ -168,7 +167,6 @@ Requires: libpmemobj >= 2.1.3-2 %endif Requires: libfabric >= %{libfabric_version} Requires: mercury-libfabric >= %{mercury_version} -Requires: mercury-libfabric < %{mercury_version_next} Requires(post): /sbin/ldconfig Requires(postun): /sbin/ldconfig Requires: numactl @@ -190,7 +188,6 @@ This package contains DAOS administrative tools (e.g. dmg). Summary: The DAOS client Requires: %{name}%{?_isa} = %{version}-%{release} Requires: mercury-libfabric >= %{mercury_version} -Requires: mercury-libfabric < %{mercury_version_next} Requires: libfabric >= %{libfabric_version} %if (0%{?suse_version} >= 1500) Requires: libfabric1 >= %{libfabric_version}