From 4148a9abe0f91160e39120755b3ffd3f33a2c1b8 Mon Sep 17 00:00:00 2001 From: Anders Kvellestad Date: Mon, 4 May 2026 09:03:08 +0200 Subject: [PATCH 01/24] Fix H5Fflush failure in hdf5_v1 printer with MultiNest The hdf5_v1 printer aborted with "H5Fflush returned error value (-1)... not a file or file object" whenever a scanner explicitly called flush() on an auxiliary print stream. MultiNest hits this on its very first dumper callback, where it flushes the "txt" (Posterior) and "live" (LastLive) RA streams. Root cause: only the primary HDF5Printer opens the HDF5 file and sets its file_id member; auxiliary printers inherit location_id, RA_location_id,and metadata_location_id from the primary but never initialise their own file_id. HDF5Printer::empty_sync_buffers() called H5Fflush(file_id, ...)unconditionally, so on an aux printer it was flushing an uninitialised hid_t. Fix: route the flush through primary_printer->file_id. primary_printer defaults to "this" for the primary and is set to the actual primary in the aux constructor, so the same call is correct in both cases. This also makes the dumper's flush() calls actually useful (they now push HDF5's internal buffers to disk) instead of erroring out. Closes #495. --- Printers/src/printers/hdf5printer/hdf5printer.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Printers/src/printers/hdf5printer/hdf5printer.cpp b/Printers/src/printers/hdf5printer/hdf5printer.cpp index cd32d6eac6..0f861b3f7c 100644 --- a/Printers/src/printers/hdf5printer/hdf5printer.cpp +++ b/Printers/src/printers/hdf5printer/hdf5printer.cpp @@ -1377,8 +1377,11 @@ namespace Gambit printer_error().raise(LOCAL_INFO, errmsg.str()); } - // Tell the HDF5 library to flush everything to disk - herr_t err = H5Fflush(file_id, H5F_SCOPE_GLOBAL); + // Tell the HDF5 library to flush everything to disk. + // Auxiliary printers don't own a file handle (only the primary opens + // the HDF5 file in its constructor), so always flush via the primary's + // file_id. For the primary itself, primary_printer == this. + herr_t err = H5Fflush(primary_printer->file_id, H5F_SCOPE_GLOBAL); if(err<0) { std::ostringstream errmsg; From 343026c87f8af5c4409ffb285ec74247b42d0d10 Mon Sep 17 00:00:00 2001 From: Anders Kvellestad Date: Mon, 4 May 2026 10:54:59 +0200 Subject: [PATCH 02/24] Added a CLAUDE.md and MINIMAL_BUILD.md --- CLAUDE.md | 17 ++++++ MINIMAL_BUILD.md | 140 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 CLAUDE.md create mode 100644 MINIMAL_BUILD.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000000..3b25d89c2f --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,17 @@ +# Build + +If you need to build/rebuild GAMBIT to perform some test, you can check +**[MINIMAL_BUILD.md](MINIMAL_BUILD.md)** for an end-to-end build recipe. +It covers system packages, Python setup, cmake flags, backend ordering, +smoke test, and timing. + + +# Run example + +``` +OMP_NUM_THREADS=1 ./gambit -rf yaml_files/spartan.yaml +``` + +(For multi-process MPI runs, re-configure cmake with `-DWITH_MPI=True` +and use `mpiexec -np ./gambit ...`.) + diff --git a/MINIMAL_BUILD.md b/MINIMAL_BUILD.md new file mode 100644 index 0000000000..dad6c35562 --- /dev/null +++ b/MINIMAL_BUILD.md @@ -0,0 +1,140 @@ +# Minimal GAMBIT build + +End-to-end recipe for a minimal GAMBIT build: ditching all the physics "Bits", not building any backends and only building a couple of scanners. + +The resulting `./gambit` binary can still be large and take several minutes to build +from scratch on 16 cores (the bottleneck is the single huge `Core/src/gambit.cpp` +translation unit). + +## 1. System packages + +``` +sudo apt-get update +sudo apt-get install -y \ + gfortran g++-12 gcc-12 \ + libeigen3-dev libgsl-dev libboost-all-dev libhdf5-dev \ + libblas-dev liblapack-dev pkg-config +``` + +Why each: +- `gfortran` — required (Fortran backends, BLAS interfaces). +- `g++-12 / gcc-12` — `cmake/backends.cmake` hard-codes + `--castxml-cc=g++-12` for the Prob3++ BOSS step (workaround for a + g++-13/clang-castxml conflict). Without g++-12 the prob3pp build + aborts at the BOSS/CastXML stage. +- `libeigen3-dev` — provides `/usr/include/eigen3`, used in the cmake + invocation below. +- `libgsl-dev`, `libboost-all-dev`, `libhdf5-dev` — runtime / build deps. +- `libblas-dev liblapack-dev` — without LAPACK, cmake's optional.cmake + errors out with "LAPACK shared library not found". + +## 2. Python packages + +By default cmake picks a different interpreter (e.g. `/usr/local/bin/python3`, +3.11 in this sandbox) and a different runtime libpython (the system one, +e.g. `libpython3.10`). That mismatch forces you to install runtime Python +packages (numpy, numba, pandas, scipy, mpmath) **twice** — once for each +interpreter — or `./gambit -h` aborts with `ModuleNotFoundError: numpy`. + +Avoid this by pointing cmake at a single Python via `PYTHON_EXECUTABLE`, +`PYTHON_INCLUDE_DIR` and `PYTHON_LIBRARY` (see the cmake step below). Then +install the packages once for **that** interpreter: + +``` +pip install --user numpy numba pandas scipy mpmath +``` + +(In the Claude Code sandbox the chosen interpreter is +`/usr/local/bin/python3` = 3.11.15, so this `pip --user` install is the +one needed.) + +## 3. Configure (cmake) + +``` +mkdir -p build && cd build +cmake -DPYTHON_EXECUTABLE=/usr/local/bin/python3 \ + -DPYTHON_INCLUDE_DIR=/usr/include/python3.11 \ + -DPYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython3.11.so \ + -DEIGEN3_INCLUDE_DIR=/usr/include/eigen3 \ + -DWITH_MPI=False \ + -DWITH_ROOT=False \ + -Ditch="ColliderBit;CosmoBit;DarkBit;NeutrinoBit;ObjectivesBit;FlavBit;DecayBit;SpecBit;PrecisionBit;Mathematica" \ + .. +``` + +Notes: +- `PYTHON_EXECUTABLE`/`PYTHON_INCLUDE_DIR`/`PYTHON_LIBRARY` make cmake use + the same Python (3.11 here) for both build-time scripts and the runtime + libpython — eliminating the dual pip install. Confirm by looking for + matching versions in the cmake output: + ``` + Using Python interpreter version 3.11.15 for build. + Using Python libraries version 3.11.15 for Python backend support. + ``` + If you see a `NOTE: You are using different Python versions for the + interpreter and the libraries!` message, the flags weren't picked up + (likely because `CMakeCache.txt` from a previous run is sticky — wipe + the build dir and rerun). +- `itch` lists modules to **exclude** from the build. +- `WITH_MPI=False` and `WITH_ROOT=False` keep the build minimal. +- This first cmake call takes ~75 seconds (it clones pybind11, runs many + feature checks). + +## 4. Build scanners, then re-cmake, then build gambit + +``` +make diver +make multinest +cmake .. # re-detect newly built scanner libraries +make gambit -j$(nproc) +``` + +Expected timings on 16 cores: +- `diver`: ~1 min (mostly download) +- `multinest`: ~1 min (mostly download) +- `cmake ..`: ~75 s +- `make gambit -j16`: Can take ~25 min (gambit.cpp.o alone can take ~20 min) + +The whole chain runs cleanly start-to-finish if build is done in the order above. + +### Long-running build tip (Claude Code) + +The `Bash` tool defaults to a 10-minute timeout, but exceeding it does +**not** kill the command — the tool returns "running in background" and +the build continues. You can poll with subsequent `ps`/`tail` calls. Or +raise the cap explicitly via the `BASH_MAX_TIMEOUT_MS` env var in +`~/.claude/settings.json` (e.g. set to `3600000` for 60 min): + +```json +{ + "env": { "BASH_MAX_TIMEOUT_MS": "3600000" } +} +``` + +To watch a verbose log instead of the silent default, prepend `VERBOSE=1`: + +``` +make gambit -j$(nproc) VERBOSE=1 +``` + +## 5. Smoke test + +``` +./gambit --version +./gambit -h +./gambit -d -f yaml_files/spartan.yaml # dry-run, prints functor tree +``` + +A successful dry-run prints the resolved dependency tree. + +## 6. Run example + +Without MPI: + +``` +OMP_NUM_THREADS=1 ./gambit -rf yaml_files/spartan.yaml +``` + +(The original `mpiexec -np N ./gambit ...` form requires `WITH_MPI=True` +at configure time — re-run cmake with that flag if needed.) + From 4d6eebcc7b1574a28e5099eedbd382b4307e870c Mon Sep 17 00:00:00 2001 From: Anders Kvellestad Date: Mon, 4 May 2026 10:55:45 +0200 Subject: [PATCH 03/24] Added file with list of hdf5 printer issues --- hdf5_printer_issues.md | 195 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 hdf5_printer_issues.md diff --git a/hdf5_printer_issues.md b/hdf5_printer_issues.md new file mode 100644 index 0000000000..77a2da70f0 --- /dev/null +++ b/hdf5_printer_issues.md @@ -0,0 +1,195 @@ +# HDF5 printer audit — synthesized findings + +Audit of the v1 (`Printers/src/printers/hdf5printer/`) and v2 +(`Printers/src/printers/hdf5printer_v2/`) HDF5 printer implementations, +performed after the file_id fix for issue #495. Findings sorted by +severity and grouped where v1 and v2 share a class of bug. + +## Critical (likely to bite real runs) + +### [v2] `final_size` is uninitialised on workers in `HDF5Printer2::finalise()` +`hdf5printer_v2.cpp:1958-2010`. `final_size` is set only inside +`if(myRank==0)` at lines 1959-1967, but the loop at 1996-2010 runs on +**every** rank and calls `(*it)->extend_all_datasets_to(final_size)` at +line 2008. Workers pass garbage. There's no rank-0 guard around the +loop. This will misbehave any time a worker has a non-synchronised aux +buffer registered (which any scanner using RA aux streams over MPI +produces). The whole 1996-2010 block belongs inside `if(myRank==0)`, +alongside the matching block at 1930-1952. + +### [v1] `errorsOff()` / `errorsOn()` nesting permanently silences HDF5 errors +`hdf5tools.cpp:369-386`. `errorsOff()` saves the old handler into +module-globals `old_func` / `old_client_data`. The function at line 377 +calls `errorsOff()` again *while errors are already off* — so it saves +NULL into those globals. The matching `errorsOn()` at line 386 then +restores NULL, leaving HDF5 error reporting permanently disabled for the +rest of the process. Same pattern at lines 395/408 (less reachable). Any +subsequent HDF5 problem will fail silently. + +### [v1] Aux printers' uninitialised `hid_t` members beyond `file_id` +`hdf5printer.hpp:336-345`. None of `file_id`, `group_id`, `RA_group_id`, +`metadata_id` (and arguably the `*_location_id` set) have default +initialisers. The aux constructor only assigns `location_id` / +`RA_location_id` / `metadata_location_id`. The `H5Fflush` issue we just +fixed (#495) is one example — others are latent. Suggests a wider fix: +either default-init all to `-1` (HDF5's invalid handle), or always go +through `primary_printer->X` in code that runs on aux instances. + +### [v1] `openGroup` returns after creating only the first non-existent path component +`hdf5tools.cpp:540-574`. On the first iteration where `H5Gopen2` fails +and `H5Gcreate2` succeeds, the inner `else` branch at line 564 returns +immediately, exiting the loop before processing the remaining +components. Latent because v1 only ever opens single-level paths +(`/data`, `/data/RA`, etc. as separate calls), but the function would +silently mis-create any user-supplied deeper `group:` option. + +### [v1] Buffer destructors do not close their HDF5 datasets +`VertexBufferNumeric1D_HDF5.hpp:294-303`, +`DataSetInterfaceBase.hpp:189-204`. Destructors are deliberately empty +(close calls commented out, with TODO about copy semantics). Cleanup +relies entirely on `finalise()` running. Any abnormal exit before +finalise — a constructor exception elsewhere, an error during +synchronisation, an uncaught signal — leaks dataset handles and likely +loses unflushed data when `H5Fclose` auto-closes them later. + +### [v2] `H5Tclose` missing in `HDF5DataSet::write_buffer` — leaks one type handle per flush +`hdf5printer_v2.hpp:304-311`. `hid_t dtype = H5Dget_type(get_dset_id())` +is opened but never closed. The companion `write_RA_buffer` at lines +438/467 *does* close it. Long scans with many datasets and many flushes +accumulate handles until HDF5's table or the OS file-handle limit gives. + +### [v2] Stack-local `RAbuffer` added to member `aux_buffers`, then destroyed +`hdf5printer_v2.cpp:1973, 1994`. `HDF5MasterBuffer RAbuffer(...)` is a +local; `add_aux_buffer(RAbuffer)` pushes its address into the +`aux_buffers` vector. When `finalise` returns, the pointer dangles. +Currently dormant (no second `finalise` call, empty destructor at 1844) +but a bug-magnet for any future change. + +## Major + +### [v1] `masterWaitForAll(FINAL_SYNC)` deadlocks if any worker dies before reaching it +`hdf5printer.cpp:987` plus `Utils/src/mpiwrapper.cpp:163-188`. The +barrier is plain blocking `Recv`s on rank 0, no Iprobe-with-timeout +(unlike `allWaitForMasterWithFunc`). The TODO comment at line 988 +acknowledges this. Same risk on the resume-mode `myComm.Barrier()` at +line 610 if `prepare_and_combine_tmp_files` raises on rank 0. + +### [v1] `exit(1)` from inside the constructor after `closeFile` +`hdf5printer.cpp:467-469`. On any restart-without-`-r` against an +existing output file with the requested group missing — a normal user +mistake — rank 0 calls `exit(1)`, bypassing MPI shutdown. Workers +blocked on the upcoming `myComm.Barrier()` at line 610 hang forever. + +### [v2] `MPI_flush_to_rank` / `MPI_recv_all_buffers` / `MPI_request_buffer_data` are dead code +`hdf5printer_v2.cpp:664-810`, `hdf5printer_v2.hpp:910-1001`, including +`recv_counter` / `send_counter` debug scaffolding. No callers. +`finalise()` uses the `gather_and_print` / `Gatherv` path instead. +Either half-finished refactor leftover or a missed wiring that should be +using this faster path. Recommend either deleting or wiring up — the +deadlock-warning comment is currently misleading. + +### [v2] `HDF5Printer2::flush()` does not gather over MPI; only the calling rank's buffermaster is touched +`hdf5printer_v2.cpp:1876-1879`. So between scanner-driven `flush()` +calls, RA aux data on workers stays in worker memory until `finalise()`. +Different from v1's effective semantics. Any scanner that calls +`aux_stream->flush()` for durability (MultiNest's dumper at +`multinest_3.12/multinest.cpp:362-363` is exactly this) will *not* get +RA data from non-rank-0 ranks to disk until the scan ends. + +### [v1] `synchronise_buffers` underflows if `sync_pos == 0` +`hdf5printer.cpp:1307`. `const unsigned long sync_pos = get_sync_pos()-1;` +becomes `ULONG_MAX`. Currently never reached because `check_for_new_point` +only triggers it after a real point has been seen, but the invariant is +not enforced anywhere. + +### [v1] Unbounded growth of `postpone_write_queue_and_locs` +`VertexBufferNumeric1D_HDF5.hpp:63, 541-544`. RA writes that can't yet +be matched to a sync point get pushed without a cap. The MPI-based +queue-sending that was supposed to bound this has been disabled +(commented out) and the comments weren't updated. + +### [v1] `_print_metadata` always passes `resume=false` +`hdf5printer.cpp:1681-1691`. Re-creates datasets on every call. +Currently only called once, at `finalise()`, so dormant — but any new +caller will trip the "Dataset with same name may already exist" path +immediately. + +### [v1] Aux constructor doesn't validate `dynamic_cast` result +`hdf5printer.cpp:683`. If the cast fails (HDF5 aux attached to non-HDF5 +primary), `primary_printer` becomes nullptr and the very next +`set_resume(primary_printer->get_resume())` segfaults. + +### [v2] `print_metadata` underflow +`hdf5printer_v2.cpp:609-616`. `std::size_t size = get_next_metadata_position(); if(sameset) size --;` +— if the GAMBIT dataset doesn't exist yet, `get_next_metadata_position()` +returns 0, `--` underflows to `SIZE_MAX`. Currently protected by the +call-site `if (use_metadata)` at 1948, but the function is unsafe. + +### [v1] Iterators decremented past `begin()` in combine size-trim loops +`hdf5_combine_tools.cpp:539-545`, `807-813`. Technically UB on +`std::vector::iterator` when all entries are false. Works on most STLs. + +### [v1] `H5Dget_type` not closed in combine tools +`hdf5_combine_tools.cpp:949-950, 955-956, 1144-1145, 1273, 1278`. +Type-handle leak per file/parameter combined; contributes to "too many +open objects" warnings on big merges. + +## Minor / Notes + +A handful more across both files: + +- v1 dead/stale state members (`current_dset_position`, `previous_points`, + `global=false` field never assigned). +- v1 `errorsOff` / `errorsOn` use static globals → fundamentally not + thread-safe. +- v1 hardcoded `BUFFERLENGTH=100` and `MAX_PPIDPAIRS=1000`. +- v1 `combine_output_py` is dead code (uses a Python helper that may not + exist). +- v2 constructor's resource-leak risk if `printer_error()` raises + mid-setup (no RAII). +- v2 generous `std::cout` / `std::cerr` use where `logger()` would be + appropriate (e.g. lines 1514, 1539, 1543, 1705, 1713, 1716, 1745, + 1762, 1965, 2018). +- v2 narrowing conversions between `std::size_t` and `int` in MPI size + handling. +- Both v1 and v2 shell out to `popen("rm ...")` and `popen("cp ...")` + rather than calling `std::remove` / filesystem APIs. Brittle on paths + with shell metacharacters. +- v2 `MAX_BUFFER_SIZE = 100000` `T buffer[MAX_BUFFER_SIZE]` is a stack + array (~800 KB for `T=double`). Fine on Linux defaults, fragile + elsewhere. +- v2 `H5Tequal(in-memory, on-disk)` check can fire spuriously on resume + because on-disk types aren't normalised through `H5Tget_native_type` + like v1 does. +- v2 `_isvalid` companion existence not separately tracked from value + dataset (TODO at `hdf5printer_v2.hpp:888`). + +## What to prioritize fixing + +1. **v2 `final_size` / `extend_all_datasets_to` on workers** — same shape + as the bug we just fixed: a finalise-time member-not-set-on-aux-path + issue that hides because some scenarios mask the UB. Will hit any MPI + scan with v2 + RA aux streams. +2. **v1 `errorsOff` / `errorsOn` nesting** — silently disables your + safety net for HDF5 errors. Trivial to fix (save state in a local). +3. **v1 uninit `hid_t` cluster** — same root cause as #495. A small + refactor to default-init or to collapse aux/primary handles into a + single struct would prevent another year of "another field wasn't set + on aux printers" bugs. +4. **v2 `H5Tclose` leak in `write_buffer`** — slow-burn but unbounded. +5. **v1 buffer-destructor cleanup** — would matter on any abnormal + shutdown, including signal-driven ones from a scheduler killing a + job. + +Everything below "Major" is good to file as low-priority issues but +probably not urgent. + +## Theme + +A lot of the fragility across both printers comes from sharing state +between primary and auxiliary printer instances via raw pointers and +ad-hoc "use this one, not that one" rules. A refactor that makes the +auxiliary's role purely a *view* over a shared `HDF5File` object (so +there are no per-printer file/group handles to forget to initialise) +would eliminate this whole class of bug. Big change, but probably worth +scoping if you're touching this code anyway. From cef688aca483efb15a9dec37303a1042d806df44 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 09:17:56 +0000 Subject: [PATCH 04/24] [hdf5_v2] Guard finalise() RA-flush block with myRank==0 to fix uninit final_size on workers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue ----- In HDF5Printer2::finalise() (Printers/src/printers/hdf5printer_v2/ hdf5printer_v2.cpp), the local `std::size_t final_size` was declared unconditionally but only assigned inside an `if(myRank==0)` block (via `buffermaster.get_next_free_position()`). Immediately after, the loop that flushes unsynchronised (random-access) aux buffers and calls `(*it)->extend_all_datasets_to(final_size)` ran on **every** rank. Worker ranks therefore passed an uninitialised value into `extend_all_datasets_to`, which is undefined behaviour. In practice this hit any MPI scan using the v2 hdf5 printer with auxiliary RA streams (i.e. anything beyond a single-rank run): worker ranks with non-empty `aux_buffers` would attempt local writes / dataset extensions that they have no business performing — only rank 0 owns the output file. The matching synchronised-buffer flush block earlier in finalise() is already correctly gated by `if(myRank==0)`, so the fix is to apply the same guard symmetrically. Fix --- - Move the `final_size` query and the entire RA-flush / extend_all_datasets_to loop inside `if(myRank==0)`. Worker ranks have already shipped their RA buffer contents to rank 0 via the preceding `gather_and_print(...)` call, which remains unguarded (it is a collective and must be called by all ranks). - Tighten the scope of the `final_size` declaration so it lives only inside the rank-0 block where it is meaningful. - Move the `add_aux_buffer(RAbuffer)` call inside the rank-0 block too, since the RAbuffer only contains gathered data on rank 0 and the loop that consumes `aux_buffers` is now rank-0-only. (Note: RAbuffer being a stack local appended to the member `aux_buffers` vector is a separate latent issue, tracked in `hdf5_printer_issues.md`.) - Added a comment explaining why the block is rank-0-only. Verification ------------ - Built GAMBIT with `-DWITH_MPI=True`. - Ran the spartan example single-rank (`OMP_NUM_THREADS=1 ./gambit -rf yaml_files/spartan.yaml`): successful, "Final dataset size is 9003". - Ran spartan with 2 MPI ranks (`mpiexec -np 2 ./gambit -rf yaml_files/spartan.yaml`): successful, "Final dataset size is 9001". Inspected the output HDF5: all 34 datasets under /data have identical length (9001), confirming sync and RA datasets were extended consistently and no output was corrupted. https://claude.ai/code/session_01QZuy7gyVUMZa3FCXShjj4X --- .../hdf5printer_v2/hdf5printer_v2.cpp | 50 +++++++++++-------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp b/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp index 410b27abe3..7f40fea80d 100644 --- a/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp +++ b/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp @@ -1955,16 +1955,6 @@ namespace Gambit std::ostringstream buffer_nonempty_report; bool buffer_nonempty_warn(false); - std::size_t final_size; - if(myRank==0) - { - /// Need to know final nominal dataset size to ensure unsynchronised datasets match synchronised ones. - buffermaster.lock_and_open_file(); - final_size = buffermaster.get_next_free_position(); - buffermaster.close_and_unlock_file(); - std::cout<<"Final dataset size is "<is_synchronised()) + /// Need to know final nominal dataset size to ensure unsynchronised datasets match synchronised ones. + buffermaster.lock_and_open_file(); + std::size_t final_size = buffermaster.get_next_free_position(); + buffermaster.close_and_unlock_file(); + std::cout<<"Final dataset size is "<flush(); - // Check if everything managed to flush! - if(not (*it)->all_buffers_empty()) + if(not (*it)->is_synchronised()) { - buffer_nonempty_report<<(*it)->buffer_status(); - buffer_nonempty_warn = true; + (*it)->flush(); + // Check if everything managed to flush! + if(not (*it)->all_buffers_empty()) + { + buffer_nonempty_report<<(*it)->buffer_status(); + buffer_nonempty_warn = true; + } + // Make sure final dataset size is correct for the unsynchronised buffers + (*it)->extend_all_datasets_to(final_size); } - // Make sure final dataset size is correct for the unsynchronised buffers - (*it)->extend_all_datasets_to(final_size); } } From 896c3b27a5258acdc14ec04b2370a35c07ca7474 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 09:18:03 +0000 Subject: [PATCH 05/24] Document optional MPI install in MINIMAL_BUILD.md Add an "Optional: MPI" subsection to step 1 covering the `libopenmpi-dev openmpi-bin` install and the corresponding `-DWITH_MPI=True` cmake flag, plus an MPI run example in step 6 (including the `--allow-run-as-root --oversubscribe` flags that are typically needed in containerised dev environments). Needed for testing any printer / scanner code paths gated on `#ifdef WITH_MPI` (e.g. the hdf5_v2 finalise rank-0 guard fixed in the preceding commit). https://claude.ai/code/session_01QZuy7gyVUMZa3FCXShjj4X --- MINIMAL_BUILD.md | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/MINIMAL_BUILD.md b/MINIMAL_BUILD.md index dad6c35562..469875dc66 100644 --- a/MINIMAL_BUILD.md +++ b/MINIMAL_BUILD.md @@ -28,6 +28,25 @@ Why each: - `libblas-dev liblapack-dev` — without LAPACK, cmake's optional.cmake errors out with "LAPACK shared library not found". +### Optional: MPI + +To build GAMBIT with MPI support (required to run multi-process scans +with `mpiexec -np N ./gambit ...`, and to exercise any printer / scanner +code paths gated on `#ifdef WITH_MPI`), additionally install OpenMPI: + +``` +sudo apt-get install -y libopenmpi-dev openmpi-bin +``` + +This provides `mpiexec`, `mpicc`, `mpic++`, and the headers/libs that +cmake's MPI detection picks up. Then add `-DWITH_MPI=True` to the cmake +invocation in step 3. Verify the install with: + +``` +mpiexec --version # should report Open MPI 4.x +which mpic++ # /usr/bin/mpic++ +``` + ## 2. Python packages By default cmake picks a different interpreter (e.g. `/usr/local/bin/python3`, @@ -76,7 +95,10 @@ Notes: (likely because `CMakeCache.txt` from a previous run is sticky — wipe the build dir and rerun). - `itch` lists modules to **exclude** from the build. -- `WITH_MPI=False` and `WITH_ROOT=False` keep the build minimal. +- `WITH_MPI=False` and `WITH_ROOT=False` keep the build minimal. Set + `-DWITH_MPI=True` instead if you installed OpenMPI per the optional + step in section 1; cmake will then pick up `mpic++` automatically and + compile the `#ifdef WITH_MPI` branches in the printer / scanner code. - This first cmake call takes ~75 seconds (it clones pybind11, runs many feature checks). @@ -135,6 +157,18 @@ Without MPI: OMP_NUM_THREADS=1 ./gambit -rf yaml_files/spartan.yaml ``` -(The original `mpiexec -np N ./gambit ...` form requires `WITH_MPI=True` -at configure time — re-run cmake with that flag if needed.) +With MPI (build configured with `-DWITH_MPI=True`): + +``` +OMP_NUM_THREADS=1 mpiexec -np 2 ./gambit -rf yaml_files/spartan.yaml +``` + +If `mpiexec` complains about running as root in a container, add +`--allow-run-as-root` (and, if you hit "not enough slots", also +`--oversubscribe`): + +``` +OMP_NUM_THREADS=1 mpiexec --allow-run-as-root --oversubscribe -np 2 \ + ./gambit -rf yaml_files/spartan.yaml +``` From dd0109b7305e2f65ddd703b5e05c2a5318097c7e Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 09:18:36 +0000 Subject: [PATCH 06/24] Ignore downloaded_tarball_paths.txt build artifact Cmake writes downloaded_tarball_paths.txt at the repo root listing the paths of scanner / backend tarballs it downloaded into ScannerBit/downloaded/ and Backends/downloaded/ during a build. It is purely a build-time bookkeeping file and should not be tracked. https://claude.ai/code/session_01QZuy7gyVUMZa3FCXShjj4X --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 59f3cd1142..5b5dddbd8b 100755 --- a/.gitignore +++ b/.gitignore @@ -85,6 +85,7 @@ Backends/installed/** ScannerBit/downloaded/** ScannerBit/installed/** ScannerBit/lib/plugin_libraries.list +downloaded_tarball_paths.txt # GUM files *.mug From 7bd5c1b4a9307e0a25bc708ff89a0592913cbf30 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 14:21:08 +0000 Subject: [PATCH 07/24] [hdf5_v1] Fix errorsOff/errorsOn nesting that permanently silenced HDF5 errors Issue ----- In Printers/src/printers/hdf5printer/hdf5tools.cpp, errorsOff() saved the current HDF5 error handler into module-globals old_func / old_client_data and then installed a NULL handler; errorsOn() restored from those globals. This is fine for an isolated pair, but it breaks when errorsOff() is called while errors are already off: errorsOff(); // saves real handler -> globals; installs NULL ... errorsOff(); // saves NULL (current handler) -> globals; installs NULL ... errorsOn(); // restores globals = NULL -> errors stay off forever There is exactly one such nested case in the v1 printer code today (hdf5tools.cpp lines 369/377/386 in the dataset corruption-recovery helper), and it permanently disables HDF5 error reporting for the rest of the process whenever it is hit. Any subsequent HDF5 problem then fails silently. There are also ~25 paired call sites across hdf5tools.cpp and hdf5_combine_tools.cpp that work fine in isolation but would corrupt the global state if any of them were ever wrapped in another errorsOff() / errorsOn() block. Fix --- Make errorsOff() / errorsOn() idempotent with a bool guard: - errorsOff(): if errors are already off, do nothing (in particular, do not call H5Eget_auto2, which would overwrite the saved handler with the NULL we installed ourselves). - errorsOn(): if errors are already on (i.e. unbalanced call), do nothing rather than blindly restoring whatever is in the saved slots. The bool / handler state is moved into an unnamed namespace at file scope, giving it internal linkage. Functionally equivalent to the existing namespace-scope globals for this codebase (no other TU references them), but it (a) makes intent explicit -- these are strictly the implementation detail of the two functions just below them -- and (b) prevents any future accidental ODR collision on the short generic names. A counter-based nesting scheme was considered but rejected: no real caller pairs an inner errorsOn() with an inner errorsOff(), so all the counter would do is silently absorb the one buggy nested call we already have. The bool keeps the logic minimal and matches what every existing caller actually wants. Note: the underlying state is still a module global, so errorsOff / errorsOn remain not thread-safe. That is a separate (lower-priority) issue tracked in hdf5_printer_issues.md. Verification ------------ - Rebuilt GAMBIT (-DWITH_MPI=True) cleanly. - Single-rank v1 spartan run: success. - 2-rank MPI v1 spartan run: success. - Resume run via combine-routines path through hdf5_combine_tools.cpp (which has the bulk of the errorsOff / errorsOn call sites): success. No HDF5 warnings or errors in any of the runs. https://claude.ai/code/session_01QZuy7gyVUMZa3FCXShjj4X --- .../src/printers/hdf5printer/hdf5tools.cpp | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/Printers/src/printers/hdf5printer/hdf5tools.cpp b/Printers/src/printers/hdf5printer/hdf5tools.cpp index 346e4ebfa6..f47200a3ae 100644 --- a/Printers/src/printers/hdf5printer/hdf5tools.cpp +++ b/Printers/src/printers/hdf5printer/hdf5tools.cpp @@ -668,9 +668,18 @@ namespace Gambit { /// Close hdf5 group SIMPLE_CALL(hid_t, closeGroup, hid_t, H5Gclose, "close", "group", "group") - /// global error variables (handler) - H5E_auto2_t old_func; - void *old_client_data; + /// State for errorsOff() / errorsOn(). File-local: only the two + /// functions below should ever touch these. The bool guard ensures + /// that a redundant errorsOff() called while errors are already off + /// does NOT overwrite the saved handler with the NULL handler we + /// installed ourselves -- otherwise the matching errorsOn() would + /// "restore" NULL and silently disable HDF5 error reporting for + /// the rest of the process. + namespace { + bool errors_are_off = false; + H5E_auto2_t saved_func = nullptr; + void *saved_client_data = nullptr; + } // FIXME: This caused compile problems on LISA cluster (CW) /// Silence error report (e.g. while probing for file existence) @@ -684,18 +693,21 @@ namespace Gambit { /// and let me know if it works :) void errorsOff() { + if (errors_are_off) return; // already off; don't clobber saved state /* Save old error handler */ - H5Eget_auto2(H5E_DEFAULT, &old_func, &old_client_data); - + H5Eget_auto2(H5E_DEFAULT, &saved_func, &saved_client_data); /* Turn off error handling */ H5Eset_auto2(H5E_DEFAULT, NULL, NULL); + errors_are_off = true; } /// Restore error report void errorsOn() { + if (!errors_are_off) return; // unbalanced / already on; no-op /* Restore previous error handler */ - H5Eset_auto2(H5E_DEFAULT, old_func, old_client_data); + H5Eset_auto2(H5E_DEFAULT, saved_func, saved_client_data); + errors_are_off = false; } /// @{ Dataset manipulations From 592ad6dfc4e1ae189a449d9f07191db7ea779db3 Mon Sep 17 00:00:00 2001 From: Anders Kvellestad Date: Mon, 4 May 2026 10:54:59 +0200 Subject: [PATCH 08/24] Added a CLAUDE.md and MINIMAL_BUILD.md --- MINIMAL_BUILD.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MINIMAL_BUILD.md b/MINIMAL_BUILD.md index 469875dc66..2b2c58c7c7 100644 --- a/MINIMAL_BUILD.md +++ b/MINIMAL_BUILD.md @@ -95,10 +95,12 @@ Notes: (likely because `CMakeCache.txt` from a previous run is sticky — wipe the build dir and rerun). - `itch` lists modules to **exclude** from the build. + - `WITH_MPI=False` and `WITH_ROOT=False` keep the build minimal. Set `-DWITH_MPI=True` instead if you installed OpenMPI per the optional step in section 1; cmake will then pick up `mpic++` automatically and compile the `#ifdef WITH_MPI` branches in the printer / scanner code. + - This first cmake call takes ~75 seconds (it clones pybind11, runs many feature checks). From 8d905d6179e39b2956dcb516989cd8977e5cc95b Mon Sep 17 00:00:00 2001 From: Anders Kvellestad Date: Mon, 4 May 2026 09:03:08 +0200 Subject: [PATCH 09/24] Fix H5Fflush failure in hdf5_v1 printer with MultiNest The hdf5_v1 printer aborted with "H5Fflush returned error value (-1)... not a file or file object" whenever a scanner explicitly called flush() on an auxiliary print stream. MultiNest hits this on its very first dumper callback, where it flushes the "txt" (Posterior) and "live" (LastLive) RA streams. Root cause: only the primary HDF5Printer opens the HDF5 file and sets its file_id member; auxiliary printers inherit location_id, RA_location_id,and metadata_location_id from the primary but never initialise their own file_id. HDF5Printer::empty_sync_buffers() called H5Fflush(file_id, ...)unconditionally, so on an aux printer it was flushing an uninitialised hid_t. Fix: route the flush through primary_printer->file_id. primary_printer defaults to "this" for the primary and is set to the actual primary in the aux constructor, so the same call is correct in both cases. This also makes the dumper's flush() calls actually useful (they now push HDF5's internal buffers to disk) instead of erroring out. Closes #495. --- yaml_files/spartan_issue_495.yaml | 456 ++++++++++++++++++++++++++++++ 1 file changed, 456 insertions(+) create mode 100644 yaml_files/spartan_issue_495.yaml diff --git a/yaml_files/spartan_issue_495.yaml b/yaml_files/spartan_issue_495.yaml new file mode 100644 index 0000000000..b87d435cb7 --- /dev/null +++ b/yaml_files/spartan_issue_495.yaml @@ -0,0 +1,456 @@ +########################################################################## +## GAMBIT configuration for an ultra-minimal scan of a toy model. +## +## Only needs ExampleBit_A and a scanner (either one of the built-ins like +## the random sampler or TWalk, or an external like Diver, MultiNest, +## GreAT or Polychord). +########################################################################## + + +Parameters: + # In this example we will simply be fitting the mean and standard deviation of a normal distribution. + NormalDist: + # Example of using simple priors: + mu: + prior_type: flat + range: [15, 30] + sigma: + prior_type: flat + range: [0, 5] + + # If using a more complicated prior specified in the 'Priors' section below, + # use the 'in_priors' keyword in this section: + # mu: in_priors + # sigma: in_priors + + +Priors: + + # None needed: simple priors are already specified in the 'Parameters' section above. + + # If defining a more complicated prior, make sure to tag the relevant parameters with the + # keyword 'in_priors' in the 'Parameters' section above. Below is an example of a + # 2D gaussian prior for the two parameters 'mu' and 'sigma': + # + # my_prior: + # parameters: ["NormalDist::mu", "NormalDist::sigma"] + # prior_type: gaussian + # mu: [18, 5] + # cov_matrix: [[9.0, 0.0], [0.0, 1.0]] + # # sigma: [3.0, 1] + + +Printer: + + # printer: sqlite + # options: + # output_file: "results.sql" + # table_name: "data" + # buffer_length: 1000 + # delete_file_on_restart: true + + # printer: hdf5 + # options: + # output_file: "results.hdf5" + # group: "/data" + # delete_file_on_restart: true + # buffer_length: 1000 + # # disable_autorepair: true + + printer: hdf5_v1 + options: + output_file: "gambit_output.hdf5" + group: "/data" + delete_file_on_restart: true + disable_combine_routines: true + + # printer: ascii + # options: + # output_file: "results.dat" + # buffer_length: 10 + # delete_file_on_restart: true + # print_debug_data: true + + # printer: none + + +Scanner: + + use_scanner: multinest + + scanners: + + random: + plugin: random + like: LogLike + point_number: 1000 + + grid: + # plugin: pygrid + plugin: grid + #version: ">=1.0" + like: LogLike + grid_pts: [5, 3] + + de: + plugin: diver + like: LogLike + NP: 500 + initial_guesses: + parameters: [NormalDist::sigma, NormalDist::mu] + values: + - [2.5, 25] + - [2, 24] + - [1.5, 22] + + multinest: + plugin: multinest + like: LogLike + nlive: 500 + #tol: 0.0001 + tol: 0.1 + + mcmc: + plugin: great + like: LogLike + nTrialLists: 5 + nTrials: 10000 + + twalk: + plugin: twalk + like: LogLike + sqrtR: 1.001 + + polychord: + plugin: polychord + like: LogLike + print_parameters_in_native_output: true + tol: 0.1 + + pso: + plugin: jswarm + like: LogLike + NP: 400 + adaptive_phi: true + adaptive_omega: true + verbosity: 2 + + minuit2: + plugin: minuit2 + like: LogLike + tolerance: 0.0001 + precision: 0.0001 + max_loglike_calls: 100000 + max_iterations: 100000 + algorithm: combined # simplex, combined, scan, fumili, bfgs, migrad + print_level: 1 + strategy: 2 + start: + NormalDist::mu: 25. + NormalDist::sigma: 2.5 + step: + NormalDist::mu: 5. + NormalDist::sigma: 1. + + scipy_basin_hopping: + like: LogLike + plugin: scipy_basin_hopping + # The run arguments below have been tested with scipy v1.9. + # Other versions might expect different arguments. + run: + # n_runs: 4 + x0: + NormalDist::mu: 25. + NormalDist::sigma: 2.5 + niter: 100 + T: 1.0 + stepsize: 0.5 + minimizer_kwargs: + method: "L-BFGS-B" + interval: 50 + disp: true + target_accept_rate: 0.5 + stepwise_factor: 0.9 + + scipy_differential_evolution: + like: LogLike + plugin: scipy_differential_evolution + # The run arguments below have been tested with scipy v1.9. + # Other versions might expect different arguments. + run: + # n_runs: 4 + strategy: 'best1bin' + maxiter: 1000 + popsize: 15 + tol: 0.01 + mutation: [0.5, 1] + recombination: 0.7 + disp: false + polish: true + init: 'latinhypercube' + atol: 0 + updating: 'immediate' + x0: + NormalDist::mu: 25. + NormalDist::sigma: 2.5 + + scipy_direct: + like: LogLike + plugin: scipy_direct + # The run arguments below have been tested with scipy v1.9. + # Other versions might expect different arguments. + run: + # n_runs: 4 + eps: 0.0001 + # maxfun: 2000 + maxiter: 1000 + locally_biased: true + vol_tol: 1e-16 + len_tol: 1e-6 + + scipy_dual_annealing: + like: LogLike + plugin: scipy_dual_annealing + # The run arguments below have been tested with scipy v1.9. + # Other versions might expect different arguments. + run: + # n_runs: 4 + maxiter: 1000 + initial_temp: 5230.0 + restart_temp_ratio: 2.0e-5 + visit: 2.62 + accept: -5.0 + maxfun: 10000000.0 + no_local_search: false + x0: + NormalDist::mu: 25. + NormalDist::sigma: 2.5 + + scipy_shgo: + like: LogLike + plugin: scipy_shgo + # The run arguments below have been tested with scipy v1.9. + # Other versions might expect different arguments. + run: + split_param_space: + NormalDist::mu: 2 + NormalDist::sigma: 2 + n: 100 + iters: 1 + sampling_method: "sobol" # "simplicial", "halton", "sobol" + minimizer_kwargs: + method: "SLSQP" # "SLSQP" "L-BFGS-B" + options: + ftol: 1e-12 + options: + # maxfev: + # f_min: + # f_tol: + # maxiter: + # maxev: + # maxtime: + # minhgrd: + # jac: false # Buggy in some scipy versions: https://github.com/scipy/scipy/issues/14533 + minimize_every_iter: true + local_iter: false + infty_constraints: true + disp: false + + scipy_minimize: + like: LogLike + plugin: scipy_minimize + # The run arguments below have been tested with scipy v1.9. + # Other versions might expect different arguments. + run: + # n_runs: 5 + x0: + NormalDist::mu: 25. + NormalDist::sigma: 2.5 + method: "L-BFGS-B" # "Nelder-Mead", "Powell", "CG", "BFGS", "L-BFGS-B", "TNC", "COBYLA", "SLSQP", "trust-constr", + # jac: "2-point" # "2-point", "3-point", "cs" + # hess: "2-point" # "2-point", "3-point", "cs" + tol: 1e-6 + options: + maxiter: 15000 + disp: false + + static_dynesty: + like: LogLike + pkg: gambit_dynesty + plugin: static_dynesty + pkl_name: "static_dynesty.pkl" + init: + nlive: 1000 + run: + dlogz: 0.5 + checkpoint_every: 60 + + dynamic_dynesty: + like: LogLike + plugin: dynamic_dynesty + pkg: gambit_dynesty + pkl_name: "dynamic_dynesty.pkl" + init: + nlive: 1000 + run: + dlogz_init: 0.5 + n_effective: 10000 + checkpoint_every: 60 + + nessai: + like: LogLike + plugin: nessai_flow_sampler + pkg: gambit_nessai + #init: + #output: "nessai_log_dir" + #logger: true + + nautilus: + like: LogLike + plugin: nautilus + #pkg: gambit_nautilus + run: + verbose: true + + zeus: + like: LogLike + pkg: gambit_zeus + plugin: zeus + init: + nwalkers: 8 + #run: + #nsteps: 20000 + #filename: "zeus.h5" + SaveProgressCallback: + filename: zeus.h5 + ncheck: 100 + + emcee: + like: LogLike + plugin: emcee + pkg: gambit_emcee + init: + nwalkers: 8 + run: + nsteps: 20000 + #filename: "emcee.h5" + + pocomc: + like: LogLike + plugin: pocomc + #pkg: gambit_pocomc + #init: + # nparticles: 1000 + + ultranest: + like: LogLike + pkg: gambit_ultranest + plugin: reactive_ultranest + #init: + run: + min_num_live_points: 1000 + dlogz: 0.5 + + binminpy: + like: LogLike + plugin: binminpy + run: + n_bins: + # For any parameters not listed under 'n_bins' binminpy will assume a single bin + NormalDist::mu: 100 + NormalDist::sigma: 100 + # sampled_parameters: ["NormalDist::mu", "NormalDist::sigma"] # By default parameters are optimized within each bin + sampler: "latinhypercube" + optimizer: "minimize" + optimizer_kwargs: + method: "L-BFGS-B" + tol: 1e-4 + n_initial_points: 10 + n_sampler_points_per_bin: 10 + # accept_loglike_above: -30.0 + accept_delta_loglike: 6.5 + neighborhood_distance: 2 + inherit_best_init_point_within_bin: False + n_optim_restarts_per_bin: 1 + n_tasks_per_batch: 1 + print_progress_every_n_batch: 100 + + +ObsLikes: + + - purpose: LogLike + capability: normaldist_loglike + module: ExampleBit_A + type: double + + #- purpose: Observable + # capability: normaldist_loglike + # module: ExampleBit_A + # type: double + # critical: true + +Rules: + + - if: + capability: normaldist_loglike + then: + options: + probability_of_validity: 1.0 + +Logger: + + redirection: + [Default] : "default.log" + [ExampleBit_A] : "ExampleBit_A.log" + [Scanner] : "Scanner.log" + debug: true + +KeyValues: + + debug: false #true + + default_output_path: "${PWD}/runs/spartan" + + # An additional entry in the dataset and metadata, useful for identifying which + # points correspond to a given scan in combined datasets. + # The default is for print_scanID: true, + # and for it to print the date and time as an int of the form + # scanID: HourMinuteSecondMillisecond. This can be overwritten to any integer. + print_scanID: true + scanID: 1 + + rng: + generator: ranlux48 + seed: -1 + + print_timing_data: true + + print_unitcube: true + + likelihood: + model_invalid_for_lnlike_below: -1e6 + + # A 'likelihood modifier function' recieves as input the total + # log-likelihood value and outputs a modified log-likelihood which + # is then passed to the scanner. This can be used to make an adaptive + # scanner explore specific ranges of the total log-likelihood, e.g. + # log-likelihood values corresponding to a given 1D/2D confidence region. + # The default is to use the 'identity' modifier, which does nothing. + use_lnlike_modifier: identity + lnlike_modifiers: + # Assuming that the best-fit log-likelihood value is 0.0, + # the 'gaussian' or 'gaussian_plateau' settings below + # will encourage the scanner to explore parameter regions + # at the border of the 2-sigma confidence region in 2D + # (Delta lnlike = -3.09). + gaussian: + mu: -3.1 + sigma: 0.5 + # use_limit: lower + use_delta_lnlike: false + gaussian_plateau: + mu_dn: -3.2 + sigma_dn: 0.5 + mu_up: -3.0 + sigma_up: 3.0 + use_delta_lnlike: false From d292d2460555863ee85958d67a33f91c93bdcc4c Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 16:27:29 +0000 Subject: [PATCH 10/24] [hdf5_v1] Default-initialise hid_t members to -1 to harden aux printer paths The HDF5Printer class has seven hid_t members for HDF5 file / group / location handles: file_id, group_id, RA_group_id, metadata_id, location_id, RA_location_id, metadata_location_id The four file/group handles are only assigned by the *primary* constructor; the aux constructor at hdf5printer.cpp:683-699 only sets the three *_location_id members (inheriting them from primary_printer). With no in-class initialisers, the four file/group handles held indeterminate values on every auxiliary printer instance. This was the root cause of issue #495 (H5Fflush(file_id, ...) firing on aux printers when MultiNest's dumper called flush() on its 'txt' / 'live' aux streams). That specific call site is now correctly routed through primary_printer->file_id (merge of upstream fix_hdf5v1_and_multinest_issue), but the underlying class invariant remained fragile: any future aux call site that forgets to go through primary_printer would silently read uninitialised memory. Fix: give all seven hid_t members an in-class default initialiser of -1. This matches the sentinel value the class already treats as "unset HDF5 handle" -- the existing get_location() / get_RA_location() / get_metadata_location() checks at hdf5printer.cpp:1144,1155,1166 already raise a printer_error when they see -1. With this change the class invariant becomes uniform: an unset HDF5 handle in this class is -1, period. Any accidental HDF5 call on an aux's file/group handle now passes H5I_INVALID_HID, which HDF5 reliably rejects -- failures will surface deterministically as printer_errors instead of as indeterminate behaviour or random "lucky" successes. The bottom three *_location_id members are assigned by both constructors today, so the = -1 there is purely defensive. Including them keeps the rule uniform across all seven handles. Verification ------------ Rebuilt and ran four scenarios cleanly (no HDF5 errors, all exit 0): - v1 + Diver, single rank - v1 + Diver, 2-rank MPI - v1 + MultiNest, single rank (was the original #495 reproducer) - v2 + Diver, single rank (regression check) https://claude.ai/code/session_01QZuy7gyVUMZa3FCXShjj4X --- .../gambit/Printers/printers/hdf5printer.hpp | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/Printers/include/gambit/Printers/printers/hdf5printer.hpp b/Printers/include/gambit/Printers/printers/hdf5printer.hpp index f6acf1b267..3008684bd1 100644 --- a/Printers/include/gambit/Printers/printers/hdf5printer.hpp +++ b/Printers/include/gambit/Printers/printers/hdf5printer.hpp @@ -332,17 +332,26 @@ namespace Gambit std::string group; // HDF5 group location to store datasets std::string metadata_group; // HDF5 group location to store metadata - // Handles for HDF5 files and groups containing the datasets - hid_t file_id; - hid_t group_id; - hid_t RA_group_id; - hid_t metadata_id; + // Handles for HDF5 files and groups containing the datasets. + // Default-initialised to -1 (the convention this class already uses + // as the "unset HDF5 handle" sentinel; see e.g. the get_location() / + // get_RA_location() / get_metadata_location() checks). Only the + // primary printer's constructor assigns the file/group handles + // (file_id, group_id, RA_group_id, metadata_id); aux instances + // inherit the *_location_id values from the primary. The default + // initialisers ensure that any code path that mistakenly reads one + // of these on an aux instance gets a deterministic invalid handle + // rather than indeterminate memory. + hid_t file_id = -1; + hid_t group_id = -1; + hid_t RA_group_id = -1; + hid_t metadata_id = -1; // Handle to a location in a HDF5 to which the datasets will be written // i.e. a file or a group. - hid_t location_id; - hid_t RA_location_id; - hid_t metadata_location_id; + hid_t location_id = -1; + hid_t RA_location_id = -1; + hid_t metadata_location_id = -1; /// Pointer to the primary printer object // (if this is an auxilliary printer, else it is "this" //NULL) From ff1955ae560612790b93aa3f07ad68dc9c17a319 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 16:32:56 +0000 Subject: [PATCH 11/24] [hdf5_v2] Close HDF5 type handle in HDF5DataSet::write_buffer (handle leak) In HDF5DataSet::write_buffer (the v2 hdf5 printer's per-flush write path for synchronised datasets), the on-disk dtype is queried with hid_t dtype = H5Dget_type(get_dset_id()); and used only for an H5Tequal sanity check against the in-memory expected_dtype. The handle was never closed before the function returned, leaking one HDF5 type handle per call. write_buffer is invoked once per flush per synchronised dataset, so on long scans with many datasets and frequent flushes the leak accumulates -- eventually HDF5's internal table or the OS file-handle limit gives, depending on the build/runtime. The companion write_RA_buffer (lines 438/467 in the same header) already closes its corresponding H5Dget_type handle, so this is purely a missing close in the synchronised path. Fix: H5Tclose(dtype) immediately after the H5Tequal check, where dtype stops being needed. Mirrors the close site in write_RA_buffer. A short comment notes why it's there so the next reader doesn't "clean it up" again. Verification ------------ - Built cleanly. - v2 + Diver, single rank: exit 0, no HDF5 errors, output file has 34 datasets all sized 9002. - v2 + Diver, 2 MPI ranks: exit 0, no HDF5 errors. (The leak itself is slow-burn -- not visible in a short spartan run -- but the fix is mechanical and the smoke tests confirm it doesn't disturb the write path.) https://claude.ai/code/session_01QZuy7gyVUMZa3FCXShjj4X --- Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp b/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp index 33a177b931..f5f40cd923 100644 --- a/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp +++ b/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp @@ -308,6 +308,11 @@ namespace Gambit errmsg << "Error! Tried to write to dataset (name="< Date: Mon, 4 May 2026 17:07:51 +0000 Subject: [PATCH 12/24] [hdf5_v1] Clarify why buffer / dataset destructors are intentionally empty The destructors of DataSetInterfaceBase and VertexBufferNumeric1D_HDF5 each carried a TODO suggesting that one day they ought to close their HDF5 dataset / flush their pending writes, with the actual cleanup calls left commented out. The TODOs implied the empty body was a hack waiting to be fixed, but in fact the empty body is required by how these objects are stored: - DataSetInterfaceBase (and DataSetInterfaceScalar) are populated via the assign-from-temporary pattern, e.g. _dsetdata = DataSetInterfaceScalar(loc, name, ...); local_buffers[key] = BuffType(loc, label, vertexID, ...); With the default copy/move semantics, the temporary and the surviving slot share the same hid_t dset_id; closing in the destructor would invalidate the handle that the surviving copy still relies on. - The same shape applies to VertexBufferNumeric1D_HDF5 itself, whose destructor would similarly run on a temporary that shares all the buffer state (HDF5 handles, sync counters, write queues) with the container slot. Doing write_to_disk() / RA_write_to_disk() there would either double-write or write through stale state. The actual close / flush is driven deterministically from HDF5Printer::flush() and HDF5Printer::finalise(), iterating over the surviving buffers in the printer's all_buffers map. If finalise() is bypassed (exception during scan setup or shutdown), HDF5's own H5Fclose() at process exit auto-closes any still-open dataset handles and flushes their pending I/O, so the file on disk remains well-formed. This commit replaces the misleading TODOs with comments explaining the actual ownership invariant, so the next reader doesn't try to "fix" the empty body and reintroduce the original handle-aliasing bug. Properly closing in the destructor would require giving these classes ownership semantics they currently don't have (e.g. wrapping the hid_t in a shared_ptr with an H5Dclose deleter, or making the chain non-copyable / move-only) -- a larger change that is out of scope here. No behaviour change. Verified by rebuilding and running the spartan example with both the v1 and v2 hdf5 printer (both exit 0, no HDF5 errors). https://claude.ai/code/session_01QZuy7gyVUMZa3FCXShjj4X --- .../hdf5printer/DataSetInterfaceBase.hpp | 32 +++++++++++-------- .../VertexBufferNumeric1D_HDF5.hpp | 24 +++++++++----- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/Printers/include/gambit/Printers/printers/hdf5printer/DataSetInterfaceBase.hpp b/Printers/include/gambit/Printers/printers/hdf5printer/DataSetInterfaceBase.hpp index 17d4b01521..e5b0a8233b 100644 --- a/Printers/include/gambit/Printers/printers/hdf5printer/DataSetInterfaceBase.hpp +++ b/Printers/include/gambit/Printers/printers/hdf5printer/DataSetInterfaceBase.hpp @@ -184,23 +184,27 @@ namespace Gambit { for(std::size_t i=0; i(loc, name, ...); + /// local_buffers[key] = BuffType(loc, label, vertexID, ...); + /// In each case a temporary is constructed, then copy-/move-assigned + /// into a member or container slot, then destroyed. With the default + /// copy semantics both objects hold the same hid_t dset_id, so + /// closing in the destructor would invalidate the handle held by the + /// surviving copy. The actual close therefore lives in the explicit + /// closeDataSet() method below, which is driven by + /// VertexBufferNumeric1D_HDF5::finalise() during the printer's + /// finalise() pass. If finalise() is bypassed (e.g. an exception + /// during scan setup or shutdown), HDF5's own H5Fclose() at process + /// exit will auto-close any still-open dataset handles and flush + /// their pending I/O, so the file on disk remains well-formed. template DataSetInterfaceBase::~DataSetInterfaceBase() { - // TODO: Having problems with copied objects sharing dataset identifiers, and closing datasets prematurely on each other. - // To fix, will probably need to have a fancy copy constructor or something. Or wrap datasets in an - // object which itself has a fancy copy constructor. For now, just leave dataset resources lying around, - // probably won't cause any issues. - // Or could explicity tell interface to close datasets before the objects are destroyed. - //if(this->dset_id>=0) - //{ - // herr_t status = H5Dclose(this->dset_id); - // if(status<0) - // { - // logger() << LogTags::printers << LogTags::err < inside H5P_LocalBufferManager) and + /// are populated via the assign-from-temporary pattern at + /// H5P_LocalBufferManager::get_buffer, so each buffer object's + /// destructor runs at least once on a temporary that shares state + /// (HDF5 dataset handles, sync counters) with the surviving + /// container slot. Triggering write_to_disk() / RA_write_to_disk() + /// here would therefore either double-write or write through stale + /// state, depending on which copy is being destroyed. End-of-scan + /// flushing is handled deterministically by HDF5Printer::flush() + /// and HDF5Printer::finalise() instead, which iterate over the + /// surviving buffers in the printer's all_buffers map. template - VertexBufferNumeric1D_HDF5::~VertexBufferNumeric1D_HDF5() + VertexBufferNumeric1D_HDF5::~VertexBufferNumeric1D_HDF5() { - //TODO: Do this in some more controlled way - //if(this->is_synchronised()) { write_to_disk(); } - //else - //{ - // RA_write_to_disk(); - //} } // Print out report on buffer sync status From 9aa6166dae68b8e5c44dc19ccb5b2d37be74a83a Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 17:51:33 +0000 Subject: [PATCH 13/24] [hdf5_v1] Replace bare exit(1) in printer ctor with printer_error().raise() In HDF5Printer::common_constructor (Printers/src/printers/hdf5printer/ hdf5printer.cpp), the rank-0 setup block at lines 414-472 handles the case where the requested output file already exists. Inside the "restart, not resume, not overwrite" branch (lines 448-469), the code opens the file, checks whether the requested group is already present, and either: - group exists -> raise a printer_error with a clear, multi-option message explaining how to fix the situation (line 465); or - group missing -> closeFile + bare exit(1) (line 468), with no message at all. The exit(1) path was buggy in two ways: 1. exit(1) bypasses MPI shutdown -- no MPI_Finalize, no MPI_Abort. Worker ranks (which never enter this rank-0-only block) sail on to the upcoming myComm.Barrier() at line 610 and hang forever. The user sees a job that never terminates, with no log message explaining why. 2. The user has no idea what went wrong: the path triggers when they pointed the printer at a file that already exists from some unrelated previous run, and the only signal is a non-zero exit code from rank 0. Fix: mirror the sibling group-exists case at line 465. Build a clear printer_error message ("file exists, requested group not in it, not in resume mode, delete_file_on_restart not set; here are the three things you can do") and raise it. printer_error().raise() throws an exception that goes through GAMBIT's normal error-reporting machinery, which at least tells the user what happened on rank 0 before tearing down -- a strict improvement over silent exit(1). Note that this commit does *not* address the broader rank-0-error- deadlocks-workers problem: if any of the four printer_error sites in this constructor block (lines 438, 445, 465, and now this one) fire on rank 0, worker ranks still hang at the Barrier downstream. That deadlock-class issue is the same one tracked under "v1 masterWaitForAll(FINAL_SYNC) deadlocks" and "Same risk on the resume-mode myComm.Barrier() at line 610" in hdf5_printer_issues.md, and needs a separate, larger fix using BarrierWithTimeout. The point of the present commit is just to bring this one site up to the same error-reporting standard as its three siblings. Verification ------------ - Built cleanly. - Regular v1 spartan run: exit 0, no change to the happy path. - Reproduced the failing scenario by seeding an output file with group "/data" via a normal run, then re-running with the same output_file but group "/data2", restart mode (-rf), no delete_file_on_restart. Before the fix this would silent-exit(1); after the fix the job terminates with a FATAL ERROR carrying the full guidance message and "Calling MPI_Finalize..." in the trailer. https://claude.ai/code/session_01QZuy7gyVUMZa3FCXShjj4X --- Printers/src/printers/hdf5printer/hdf5printer.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/Printers/src/printers/hdf5printer/hdf5printer.cpp b/Printers/src/printers/hdf5printer/hdf5printer.cpp index 0f861b3f7c..77a099393f 100644 --- a/Printers/src/printers/hdf5printer/hdf5printer.cpp +++ b/Printers/src/printers/hdf5printer/hdf5printer.cpp @@ -465,7 +465,20 @@ namespace Gambit printer_error().raise(LOCAL_INFO, errmsg.str()); } HDF5::closeFile(file_id); - exit(1); + // Group does not exist in the pre-existing file. Refuse to + // proceed: we are not in resume mode and 'delete_file_on_restart' + // is not set, so adding a new group to a file from an unrelated + // previous run risks silently mixing unrelated data. Raise a + // printer_error with actionable advice rather than calling + // bare exit(1) (which bypassed MPI shutdown and left worker + // ranks hanging at the upcoming Barrier with no error + // message). + std::ostringstream errmsg2; + errmsg2 << "Error preparing pre-existing output file '"< Date: Mon, 4 May 2026 18:27:54 +0000 Subject: [PATCH 14/24] [hdf5_v2] Delete unused MPI_flush_to_rank / MPI_recv_all_buffers / MPI_request_buffer_data path The v2 hdf5 printer carried two parallel sets of MPI machinery for shipping buffer data from worker ranks to the rank-0 master: 1. The currently-used "gather and Gatherv" path in finalise() and flush(), built around HDF5MasterBuffer::add_to_buffers and the HDF5bufferchunk message format. This is what actually runs in every MPI scan today. 2. A second, unwired Send/Recv path consisting of: - HDF5MasterBuffer::MPI_flush_to_rank - HDF5MasterBuffer::MPI_request_buffer_data - HDF5MasterBuffer::MPI_recv_all_buffers - HDF5MasterBuffer::MPI_recv_buffer - HDF5Buffer::MPI_flush_to_rank - HDF5Buffer::MPI_recv_from_rank - HDF5BufferBase::MPI_flush_to_rank (pure-virtual decl) plus eight dedicated MPI tag constants (h5v2_bufname / h5v2_bufdata_points / h5v2_bufdata_ranks / h5v2_bufdata_valid / h5v2_bufdata_type / h5v2_bufdata_values / h5v2_BLOCK / h5v2_BEGIN) and some commented-out recv_counter / send_counter debug scaffolding. The path in (2) is internally consistent (the functions only call each other) but has no external callers anywhere in the codebase, and the MPI_request_buffer_data side sends an h5v2_BEGIN tag for which there is no matching Recv -- a strong sign the path was abandoned mid-wiring rather than being a finished alternative. This commit deletes (2) outright. Two reasons for choosing delete over finishing the wiring: - The currently-used gather_and_print path is correct and tested (passes our spartan smoke tests in single-rank and 2-rank MPI modes, including with auxiliary RA streams). - The send-side comment in MPI_recv_all_buffers explicitly warns of a deadlock if MPI_request_buffer_data is not called first; revisiting that ordering and the request/recv handshake to make it deadlock-free would be a real design effort, not a hygiene fix. The audit recommended either deleting or wiring up; given there is no concrete reason to believe the unfinished path would outperform the working one, deletion is the lower-risk choice. Net change: -266 lines / +4 lines across hdf5printer_v2.hpp and hdf5printer_v2.cpp. add_to_buffers, MPI_add_int_block_to_buffer, MPI_add_float_block_to_buffer (which ARE used by the live path) are untouched. Verification ------------ - Built cleanly. - v2 + Diver, single rank: exit 0, output has 34 datasets all sized 9001 (uniform, as expected). - v2 + Diver, 2 MPI ranks: exit 0, output has 34 datasets all sized 9001 (uniform). https://claude.ai/code/session_01QZuy7gyVUMZa3FCXShjj4X --- .../Printers/printers/hdf5printer_v2.hpp | 160 +----------------- .../hdf5printer_v2/hdf5printer_v2.cpp | 110 ------------ 2 files changed, 4 insertions(+), 266 deletions(-) diff --git a/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp b/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp index f5f40cd923..a6f996d1cc 100644 --- a/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp +++ b/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp @@ -68,10 +68,6 @@ namespace Gambit typedef long long longlong; typedef unsigned long long ulonglong; - // DEBUG h5v2_BLOCK message counters - //static int recv_counter; - //static int send_counter; - /// Length of chunks in chunked HDF5 dataset. Affects write/retrieval performance for blocks of data of various sizes. /// It is set to an "intermediate" sort of size since that seems to work well enough. static const std::size_t HDF5_CHUNKLENGTH = 100; @@ -82,22 +78,10 @@ namespace Gambit /// Largest allowed size of buffers. Size can be dynamically set from 1 to this number. static const std::size_t MAX_BUFFER_SIZE = 100000; - /// MPI tags for HDF5 printer v2 - const int h5v2_bufname(10); - const int h5v2_bufdata_points(11); - const int h5v2_bufdata_ranks(12); - const int h5v2_bufdata_valid(13); - const int h5v2_bufdata_type(14); - const int h5v2_bufdata_values(15); - // Message block end tag: - // 1 means "there is another block of messages to receive" - // 0 means "there are no more blocks of messages to receive" - const int h5v2_BLOCK(30); - // "Begin sending data" tag - const int h5v2_BEGIN(31); - - // The 'h5v2_bufdata_type' messages send an integer encoding - // the datatype for the h5v2_bufdata_values messages + // The integer-encoded type ID for buffer-data messages + // (formerly used by h5v2_bufdata_type / h5v2_bufdata_values; the + // type encoding itself is still in use by the gather_and_print + // path via HDF5bufferchunk and add_to_buffers). // Need a unique integer for each type. We can encode these // with a template function: @@ -643,12 +627,6 @@ namespace Gambit // int version virtual std::pair,std::vector> flush_to_vector_int(const std::vector& order) = 0; -#ifdef WITH_MPI - /// Send buffer contents to another process - virtual void MPI_flush_to_rank(const unsigned int r) = 0; - -#endif - /// Make sure datasets exist on disk with the correct name and size virtual void ensure_dataset_exists(const hid_t loc_id, const std::size_t length) = 0; @@ -910,102 +888,6 @@ namespace Gambit return buffer.size(); } -#ifdef WITH_MPI - // Send buffer contents to a different process - void MPI_flush_to_rank(const unsigned int r) - { - if(buffer.size()>0) - { - // Get name of the dataset this buffer is associated with - std::string namebuf = dset_name(); - // Copy point data and values into MPI send buffer - std::vector pointIDs; - std::vector ranks; // Will assume all PPIDpairs are valid. I think this is fine to do... - std::vector valid; // We have to send the invalid points too, to maintain buffer synchronicity - std::vector values; - int type(h5v2_type()); // Get integer identifying the type of the data values - int more_buffers = 1; // Flag indicating that there is a block of data to receive - for(auto it=buffer.begin(); it!=buffer.end(); ++it) - { - pointIDs.push_back(it->first.pointID); - ranks .push_back(it->first.rank); - valid .push_back(buffer_valid.at(it->first)); - values .push_back(it->second); - } - - // Debug info - #ifdef HDF5PRINTER2_DEBUG - logger()< pointIDs(Npoints); - std::vector ranks(Npoints); - std::vector valid(Npoints); - std::vector values(Npoints); - - // Receive buffer data - myComm.Recv(&values[0] , Npoints, r, h5v2_bufdata_values); - myComm.Recv(&pointIDs[0], Npoints, r, h5v2_bufdata_points); - myComm.Recv(&ranks[0] , Npoints, r, h5v2_bufdata_ranks); - myComm.Recv(&valid[0] , Npoints, r, h5v2_bufdata_valid); - - // Pack it into this buffer - #ifdef HDF5PRINTER2_DEBUG - logger()<& buffer = get_buffer(dset_name, buffered_points); - //std::cout<<"(rank "< void MPI_add_int_block_to_buffer(const HDF5bufferchunk& chunk, const std::string& dset_name, const std::size_t dset_index) diff --git a/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp b/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp index 7f40fea80d..c882b260a2 100644 --- a/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp +++ b/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp @@ -660,108 +660,6 @@ namespace Gambit } #ifdef WITH_MPI - /// Send buffer data to a specified process - void HDF5MasterBuffer::MPI_flush_to_rank(const unsigned int r) - { - for(auto it=all_buffers.begin(); it!=all_buffers.end(); ++it) - { - it->second->MPI_flush_to_rank(r); - } - buffered_points.clear(); - buffered_points_set.clear(); - } - - /// Give process r permission to begin sending its buffer data - // Don't do this for all processes at once, as MPI can run out of - // Recv request IDs behind the scenes if thousands of processes are - // trying to send it lots of stuff at once. But it is good to let - // many processes start sending data before we need it, so that the - // Recv goes quickly (and is effectively already done in the background) - // when we get to it. - void HDF5MasterBuffer::MPI_request_buffer_data(const unsigned int r) - { - logger()<< LogTags::printers << LogTags::info << "Asking process "<(): Npoints = MPI_recv_buffer(r, dset_name); break; - case h5v2_type(): Npoints = MPI_recv_buffer(r, dset_name); break; - case h5v2_type(): Npoints = MPI_recv_buffer(r, dset_name); break; - case h5v2_type(): Npoints = MPI_recv_buffer(r, dset_name); break; - //case h5v2_type(): Npoints = MPI_recv_buffer(r, dset_name); break; - //case h5v2_type(): Npoints = MPI_recv_buffer(r, dset_name); break; - case h5v2_type(): Npoints = MPI_recv_buffer(r, dset_name); break; - case h5v2_type(): Npoints = MPI_recv_buffer(r, dset_name); break; - default: - std::ostringstream errmsg; - errmsg<<"Unrecognised datatype integer (value = "<max_Npoints) max_Npoints = Npoints; - } - } - - logger()<second->N_items_in_buffer()<<" (name="<second->dset_name()<<")"<& blocks, const std::vector>& buf_types) { @@ -1882,10 +1780,6 @@ namespace Gambit // No distinction between final and early termination procedure for this printer. void HDF5Printer2::finalise(bool /*abnormal*/) { - // DEBUG h5v2_BLOCK message counter - //recv_counter = 0; - //send_counter = 0; - // The primary printer will take care of finalising all output. if(not is_auxilliary_printer()) { @@ -2026,10 +1920,6 @@ namespace Gambit } logger()<< LogTags::printers << LogTags::info << "HDF5Printer2 output finalisation complete."< Date: Mon, 4 May 2026 20:31:00 +0000 Subject: [PATCH 15/24] [hdf5_v1] Close H5Dget_type ids leaked in combine_tools hdf5_combine_tools.cpp called H5Dget_type at 8 sites to obtain the HDF5 datatype of an existing dataset (in order to recreate the dataset in the combined output via setup_hdf5_points), but never released the returned datatype id. H5Dget_type returns a fresh id that the caller owns and must close with H5Tclose; otherwise the id (and its underlying in-memory state) lingers until the file/library is shut down. The leaks were at three call sites within hdf5_stuff::Combine_HDF5: - Primary-dataset combine loop (~L949-956): type/type2 obtained per batch from either the previous combined dataset or a temp-file dataset, used by setup_hdf5_points only on batch 0, but allocated on every batch and never freed. Fixed by H5Tclose(type/type2) after the dataset_out/dataset2_out closes inside the same per-batch if(valid_dset>=0 or old_dataset>=0) block, so each iteration balances its allocations. - Auxiliary-dataset path (~L1144-1145): type/type2 obtained inside the "param not also a primary" branch, used immediately by setup_hdf5_points. Fixed by H5Tclose right after that call, guarded by >=0 since the variables are pre-initialised to -1 above. - Metadata-dataset combine loop (~L1273, L1278): single 'type' id, same pattern as the primary loop. Fixed by H5Tclose(type) after the dataset_out close inside the per-batch valid branch. These were straightforward resource leaks: setup_hdf5_points only reads 'type' to pass it to H5Dcreate2 and does not take ownership, so adding the closes at the end of the using-scope is correct and safe. Verification ------------ - Built cleanly (only Printers translation unit recompiled). - Ran spartan with the v1 (hdf5_v1) printer, 2 MPI ranks, with disable_combine_routines: false so the combine path actually exercises all three call sites. Run completed cleanly (exit 0): * 17 primary parameters combined, no errors * 51 metadata parameters combined, no errors * 0 auxiliary parameters in this scan (path not exercised, but that branch is structurally identical to the primary path) * combined output gambit_output.hdf5 contains uniform 9501-row 1-D datasets across all primary parameters. Out of scope (not fixed here) ----------------------------- - hdf5_combine_tools.hpp Enter_HDF5() at line 377 calls H5Tget_native_type(dtype, ...) and stores the result in 'type', then closes only 'dtype' (line 448) before returning. The native-type id is also a fresh id that needs H5Tclose. This is a separate leak (a different HDF5 API) and the issue title is scoped specifically to H5Dget_type; flagging here for follow-up. https://claude.ai/code/session_01QZuy7gyVUMZa3FCXShjj4X --- Printers/src/printers/hdf5printer/hdf5_combine_tools.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Printers/src/printers/hdf5printer/hdf5_combine_tools.cpp b/Printers/src/printers/hdf5printer/hdf5_combine_tools.cpp index 2d0469b23c..16b88fd171 100644 --- a/Printers/src/printers/hdf5printer/hdf5_combine_tools.cpp +++ b/Printers/src/printers/hdf5printer/hdf5_combine_tools.cpp @@ -992,6 +992,8 @@ namespace Gambit // Close resources HDF5::closeDataset(dataset_out); HDF5::closeDataset(dataset2_out); + H5Tclose(type); + H5Tclose(type2); // Move offset so that next batch is written to correct place in output file offset += batch_size_tot + pc_offset; @@ -1152,6 +1154,8 @@ namespace Gambit } // Create new dataset setup_hdf5_points(new_group, type, type2, size_tot, *it); + if(type>=0) H5Tclose(type); + if(type2>=0) H5Tclose(type2); } // Reopen output datasets for copying hid_t dataset_out = HDF5::openDataset(new_group, *it); @@ -1307,6 +1311,7 @@ namespace Gambit // Close resources HDF5::closeDataset(dataset_out); + H5Tclose(type); // Move offset so that next batch is written to correct place in output file offset += batch_size_tot + pc_offset; From 533d150060053ed1f7d85e4373124751460364dc Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 20:37:46 +0000 Subject: [PATCH 16/24] [hdf5_v1] Avoid undefined iterator decrement past begin() in combine size-trim loops Two near-identical loops in hdf5_stuff (one in the primary-dataset scanning pass, one in the auxiliary-dataset Enter_Aux_Parameters pass) walked a std::vector backwards to count the number of trailing invalid entries to trim from a per-file size: for (auto it = valids.end()-1; size > 0; --it) { if (*it) break; else --size; } There were two problems with this idiom: 1. If valids.size() == 0 (which happens cleanly when the temp file's sync group is empty), valids.end()-1 is undefined behavior. In practice the loop condition `size > 0` would skip the body, but the iterator is still constructed. 2. More importantly: when ALL entries in valids are invalid (so the "break" branch is never taken), the final iteration decrements size from 1 to 0 and dereferences valids.front() correctly, but then the for-loop's post-step runs `--it` once more, producing an iterator one before begin(). That decrement is undefined behavior on std::vector::iterator -- even though it's typically a pointer under the hood and "happens to work", a sanitizer or a debug iterator implementation can flag/abort. The condition `size > 0` is then evaluated on the already-invalid iterator's surrounding state, but the UB has already occurred at the --it. Replaced both loops with a straightforward index-based trim: while (size > 0 && !valids[size-1]) --size; Equivalent semantics (trim trailing invalids, keep first valid), but no iterator arithmetic, no past-the-begin decrement, and no empty-vector trap. Both call sites already guarantee size <= valids.size() (size and valids both come from the pointID_isvalid / RA_pointID_isvalid dataset, with the prior size != size2 check raising on mismatch), so valids[size-1] is safely in range whenever size > 0. Verification ------------ - Built cleanly (Printers TU only). - Spartan + hdf5_v1 + 2 MPI ranks + disable_combine_routines: false: exit 0, combined output has uniform 9000-row primary datasets and 1-row metadata datasets. The primary-pass trim loop runs once per temp file (2x in this scan); the aux-pass trim loop is structurally identical and exercised by the same Enter_HDF5/read_hdf5 path. https://claude.ai/code/session_01QZuy7gyVUMZa3FCXShjj4X --- .../hdf5printer/hdf5_combine_tools.cpp | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/Printers/src/printers/hdf5printer/hdf5_combine_tools.cpp b/Printers/src/printers/hdf5printer/hdf5_combine_tools.cpp index 16b88fd171..bf23eebd55 100644 --- a/Printers/src/printers/hdf5printer/hdf5_combine_tools.cpp +++ b/Printers/src/printers/hdf5printer/hdf5_combine_tools.cpp @@ -536,13 +536,9 @@ namespace Gambit size_tot_l = size_tot + size; // Last? //} - for (auto it = valids.end()-1; size > 0; --it) - { - if (*it) - break; - else - --size; - } + // Trim any trailing invalid points (size <= valids.size() ensured above) + while (size > 0 && !valids[size-1]) + --size; HDF5::closeSpace(dataspace); HDF5::closeSpace(dataspace2); @@ -804,13 +800,9 @@ namespace Gambit ranks.push_back(valid_rank); ptids.push_back(valid_ptid); - for (auto it = valids.end()-1; size > 0; --it) - { - if (*it) - break; - else - --size; - } + // Trim any trailing invalid points (size <= valids.size() ensured above) + while (size > 0 && !valids[size-1]) + --size; aux_sizes.push_back(size); HDF5::closeSpace(dataspace); From 732330636d5cbb27d594212fe732e38c965ae5f4 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 21:14:05 +0000 Subject: [PATCH 17/24] [hdf5_v2] HDF5Printer2::flush() now also flushes registered aux_buffers Before this change, HDF5Printer2::flush() only invoked buffermaster.flush() on the calling printer instance. That meant a public flush() on the primary printer left every auxilliary printer's buffermaster -- metadata streams, RA streams, and any other aux streams the scan has registered -- with their points still in RAM until either the per-stream sync auto-flush triggered (when a sync buffer fills) or finalise() ran. The two known external callers of the public API (twalk's out_stream->flush() at the end of a temp-file replay batch and the Python scanner binding in py_module_scanbit.hpp) both expect "persist what I've written so far", so the partial flush was a behavioural mismatch with caller intent and a divergence from the v1 printer (HDF5Printer::flush() iterates all buffers). The fix: after flushing this->buffermaster, if this is the primary printer, iterate the aux_buffers list and flush each. Aux printers register their own buffermaster into the primary's aux_buffers in their constructor (~hdf5printer_v2.cpp:1342 add_aux_buffer) and have an empty aux_buffers themselves, so guarding the loop on not is_auxilliary_printer() preserves aux-printer semantics (unchanged: still just flushes its own buffermaster). Two passes -- sync streams first, RA streams second -- mirror the ordering already used in finalise() (~:1824-1833 then :1895-1909). Reason: RA writes anchor to positions established by sync writes, so flushing RA before its sync targets are on disk can leave RA points stuck in the postpone queue. What is intentionally NOT done here: * No MPI gather. The architecture defers cross-rank gathering to finalise() as a performance optimisation for HPC networked filesystems (see comment block at hdf5printer_v2.cpp:1786-1793). Mid-scan, every rank just writes its own buffered points to the shared file under the existing cross-rank Utils::FileLock taken inside HDF5MasterBuffer::flush() (lock_and_open_file -> hdf5out.get_lock() at :741). Adding an MPI collective inside the public flush() would deadlock for non-collective callers (twalk's per-rank flush, Python scanners) and is unnecessary for "persist my data" semantics. * No explicit H5Fflush. v1 keeps the file open between buffer flushes and ends flush() with H5Fflush(file_id, H5F_SCOPE_GLOBAL). v2's HDF5MasterBuffer::flush() always closes the file at the end via close_and_unlock_file -> H5Fclose, which already drains the HDF5 internal cache to disk, making H5Fflush redundant. * Auto-flush path unchanged. add_buffered_point's "buffer full" auto flush at hdf5printer_v2.hpp:1128 / 1137 calls HDF5MasterBuffer::flush() (same class), not HDF5Printer2::flush(). That path is untouched. * finalise() unchanged. It still does its own MPI gather and per-buffer flush logic without going through the public flush(). Cost: the new loop incurs N additional lock_and_open_file -> close_and_unlock_file cycles per public flush() call (N = number of registered aux buffers), but HDF5MasterBuffer::flush() short-circuits on get_Npoints() == 0 (:479) without taking the lock, so empty aux buffers cost essentially zero. flush() is rare on the hot path; this is fine. Verification ------------ - Built cleanly (Printers TU only). - Spartan + v2 + single rank: exit 0, 34 primary datasets all sized 9502 (uniform). - Spartan + v2 + 2 MPI ranks: exit 0, 34 primary datasets all sized 9003 (uniform). Both runs produce well-formed output; the change is additive and preserves existing on-disk layout because finalise() already flushes everything at end-of-scan -- this commit just makes the same flushing happen on demand when flush() is called. https://claude.ai/code/session_01QZuy7gyVUMZa3FCXShjj4X --- .../hdf5printer_v2/hdf5printer_v2.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp b/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp index c882b260a2..b6cbf39f50 100644 --- a/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp +++ b/Printers/src/printers/hdf5printer_v2/hdf5printer_v2.cpp @@ -1773,7 +1773,26 @@ namespace Gambit // Flush data in buffers to disk void HDF5Printer2::flush() { + // Each rank writes its own buffered points under the cross-rank + // Utils::FileLock taken inside HDF5MasterBuffer::flush(); no MPI + // gather here. The collective gather happens only in finalise(). + // Auxilliary printers register their buffermaster into the primary's + // aux_buffers (see constructor) and have an empty aux_buffers + // themselves, so only the primary needs to iterate. + // Sync streams before RA streams: matches finalise()'s ordering, + // since RA writes target positions established by sync writes. buffermaster.flush(); + if(not is_auxilliary_printer()) + { + for(auto it=aux_buffers.begin(); it!=aux_buffers.end(); ++it) + { + if((*it)->is_synchronised()) (*it)->flush(); + } + for(auto it=aux_buffers.begin(); it!=aux_buffers.end(); ++it) + { + if(not (*it)->is_synchronised()) (*it)->flush(); + } + } } // Make sure printer output is fully on disk and safe From a4847e69d040f28eded31b17d46ddb57f050d9c9 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 5 May 2026 05:21:12 +0000 Subject: [PATCH 18/24] [hdf5_v1] Guard sync_pos==0 underflow in synchronise_buffers and get_buffer fast_forward HDF5Printer's sync_pos member is unsigned long, initialised to 0 (hdf5printer.hpp:416), and only ever incremented (in increment_sync_pos at hdf5printer.cpp:1650). Two call sites compute get_sync_pos()-1 without first checking that sync_pos>0, which would underflow to ULONG_MAX when sync_pos==0: 1. HDF5Printer::synchronise_buffers (hdf5printer.cpp:1320): const unsigned long sync_pos = get_sync_pos()-1; ... it->second->synchronise_output_to_position(sync_pos); The underflowed ULONG_MAX would be stored into each buffer's target_sync_pos. For RA buffers this later feeds extend_dset() in RA_write_to_disk, which would attempt to extend the dataset to ULONG_MAX rows. For sync buffers, the movediff calculation in synchronise_output_to_position relies on sync_pos+1 (ulong), which then wraps to 0 -- harmless only when dset_head_pos() also happens to be 0. 2. H5P_LocalBufferManager::get_buffer (hdf5printer.hpp:558): if(synchronised) it->second.fast_forward( printer->get_sync_pos()-1); fast_forward takes a signed long target_pos. With sync_pos==0, get_sync_pos()-1 == ULONG_MAX (ulong), implicitly converted to long it becomes -1 on two's-complement platforms. fast_forward then computes needed_steps = -1 - 0 = -1 (negative) and raises the "would need to move backwards" error (VertexBufferBase.hpp:218). In normal scans both bugs are dormant because of call ordering: - synchronise_buffers' first invocation (from the first check_for_new_point at :1632) happens before any buffer has been added to all_buffers, so the for-loop iterates over nothing and the underflowed value is never delivered. - fast_forward via get_buffer is reached only after check_for_new_point has run increment_sync_pos at :1650, taking sync_pos from 0 to 1, so by the time fast_forward executes sync_pos>=1 and target_pos>=0. The underflow is therefore real but masked by today's lifecycle. Adding a buffer earlier than the first check_for_new_point, or calling synchronise_buffers from a new code path before any pointID arrives, would surface either bug. Fix: guard each call site with sync_pos>0. Semantically, sync_pos==0 means "no point has yet been registered as the current sync position", so there is nothing to synchronise to or catch up to; returning early / skipping the call is the correct no-op behaviour and is consistent with buffers being at dset_head_pos==0 from construction. This is a surgical fix that preserves all existing behaviour: the two guarded paths were never executed with sync_pos==0 in practice, so guarding them doesn't change any observable behaviour today -- it only removes the latent underflow. Verification ------------ - Built cleanly. - Spartan + hdf5_v1 + 1 rank: exit 0, 34 primary datasets all sized 9000 (uniform). - Spartan + hdf5_v1 + 2 MPI ranks (with combine routines enabled): exit 0, 34 primary datasets all sized 9501 (uniform). https://claude.ai/code/session_01QZuy7gyVUMZa3FCXShjj4X --- Printers/include/gambit/Printers/printers/hdf5printer.hpp | 5 ++++- Printers/src/printers/hdf5printer/hdf5printer.cpp | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Printers/include/gambit/Printers/printers/hdf5printer.hpp b/Printers/include/gambit/Printers/printers/hdf5printer.hpp index 3008684bd1..c6a8acb1bc 100644 --- a/Printers/include/gambit/Printers/printers/hdf5printer.hpp +++ b/Printers/include/gambit/Printers/printers/hdf5printer.hpp @@ -555,7 +555,10 @@ namespace Gambit // We subtract one because another increment will happen after // the print statement (that triggered the creation of the new // buffer) completes. - if(synchronised) it->second.fast_forward(printer->get_sync_pos()-1); + // If sync_pos is still 0 there is no previous position to catch up + // to (and the buffer is already at dset_head_pos == 0 from + // construction), so skip the call to avoid an unsigned underflow. + if(synchronised and printer->get_sync_pos()>0) it->second.fast_forward(printer->get_sync_pos()-1); } if( it == local_buffers.end() ) diff --git a/Printers/src/printers/hdf5printer/hdf5printer.cpp b/Printers/src/printers/hdf5printer/hdf5printer.cpp index 77a099393f..13f530818f 100644 --- a/Printers/src/printers/hdf5printer/hdf5printer.cpp +++ b/Printers/src/printers/hdf5printer/hdf5printer.cpp @@ -1316,6 +1316,12 @@ namespace Gambit printer_error().raise(LOCAL_INFO, errmsg.str()); } + // If sync_pos is still 0 then no point has yet been registered as the + // current sync position, so there is no "previous slot" for buffers to + // catch up to. Return early to avoid the unsigned underflow that would + // otherwise occur in get_sync_pos()-1 below. + if(get_sync_pos()==0) return; + // Determine the desired sync position const unsigned long sync_pos = get_sync_pos()-1; From 6952d63f52f24e4f25fd8562bff4924ccb42c250 Mon Sep 17 00:00:00 2001 From: Anders Kvellestad Date: Tue, 5 May 2026 08:57:00 +0200 Subject: [PATCH 19/24] Deleted temp files used for working on the hdf5 printers --- CLAUDE.md | 17 -- MINIMAL_BUILD.md | 176 ------------ hdf5_printer_issues.md | 195 ------------- yaml_files/spartan_issue_495.yaml | 456 ------------------------------ 4 files changed, 844 deletions(-) delete mode 100644 CLAUDE.md delete mode 100644 MINIMAL_BUILD.md delete mode 100644 hdf5_printer_issues.md delete mode 100644 yaml_files/spartan_issue_495.yaml diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index 3b25d89c2f..0000000000 --- a/CLAUDE.md +++ /dev/null @@ -1,17 +0,0 @@ -# Build - -If you need to build/rebuild GAMBIT to perform some test, you can check -**[MINIMAL_BUILD.md](MINIMAL_BUILD.md)** for an end-to-end build recipe. -It covers system packages, Python setup, cmake flags, backend ordering, -smoke test, and timing. - - -# Run example - -``` -OMP_NUM_THREADS=1 ./gambit -rf yaml_files/spartan.yaml -``` - -(For multi-process MPI runs, re-configure cmake with `-DWITH_MPI=True` -and use `mpiexec -np ./gambit ...`.) - diff --git a/MINIMAL_BUILD.md b/MINIMAL_BUILD.md deleted file mode 100644 index 2b2c58c7c7..0000000000 --- a/MINIMAL_BUILD.md +++ /dev/null @@ -1,176 +0,0 @@ -# Minimal GAMBIT build - -End-to-end recipe for a minimal GAMBIT build: ditching all the physics "Bits", not building any backends and only building a couple of scanners. - -The resulting `./gambit` binary can still be large and take several minutes to build -from scratch on 16 cores (the bottleneck is the single huge `Core/src/gambit.cpp` -translation unit). - -## 1. System packages - -``` -sudo apt-get update -sudo apt-get install -y \ - gfortran g++-12 gcc-12 \ - libeigen3-dev libgsl-dev libboost-all-dev libhdf5-dev \ - libblas-dev liblapack-dev pkg-config -``` - -Why each: -- `gfortran` — required (Fortran backends, BLAS interfaces). -- `g++-12 / gcc-12` — `cmake/backends.cmake` hard-codes - `--castxml-cc=g++-12` for the Prob3++ BOSS step (workaround for a - g++-13/clang-castxml conflict). Without g++-12 the prob3pp build - aborts at the BOSS/CastXML stage. -- `libeigen3-dev` — provides `/usr/include/eigen3`, used in the cmake - invocation below. -- `libgsl-dev`, `libboost-all-dev`, `libhdf5-dev` — runtime / build deps. -- `libblas-dev liblapack-dev` — without LAPACK, cmake's optional.cmake - errors out with "LAPACK shared library not found". - -### Optional: MPI - -To build GAMBIT with MPI support (required to run multi-process scans -with `mpiexec -np N ./gambit ...`, and to exercise any printer / scanner -code paths gated on `#ifdef WITH_MPI`), additionally install OpenMPI: - -``` -sudo apt-get install -y libopenmpi-dev openmpi-bin -``` - -This provides `mpiexec`, `mpicc`, `mpic++`, and the headers/libs that -cmake's MPI detection picks up. Then add `-DWITH_MPI=True` to the cmake -invocation in step 3. Verify the install with: - -``` -mpiexec --version # should report Open MPI 4.x -which mpic++ # /usr/bin/mpic++ -``` - -## 2. Python packages - -By default cmake picks a different interpreter (e.g. `/usr/local/bin/python3`, -3.11 in this sandbox) and a different runtime libpython (the system one, -e.g. `libpython3.10`). That mismatch forces you to install runtime Python -packages (numpy, numba, pandas, scipy, mpmath) **twice** — once for each -interpreter — or `./gambit -h` aborts with `ModuleNotFoundError: numpy`. - -Avoid this by pointing cmake at a single Python via `PYTHON_EXECUTABLE`, -`PYTHON_INCLUDE_DIR` and `PYTHON_LIBRARY` (see the cmake step below). Then -install the packages once for **that** interpreter: - -``` -pip install --user numpy numba pandas scipy mpmath -``` - -(In the Claude Code sandbox the chosen interpreter is -`/usr/local/bin/python3` = 3.11.15, so this `pip --user` install is the -one needed.) - -## 3. Configure (cmake) - -``` -mkdir -p build && cd build -cmake -DPYTHON_EXECUTABLE=/usr/local/bin/python3 \ - -DPYTHON_INCLUDE_DIR=/usr/include/python3.11 \ - -DPYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython3.11.so \ - -DEIGEN3_INCLUDE_DIR=/usr/include/eigen3 \ - -DWITH_MPI=False \ - -DWITH_ROOT=False \ - -Ditch="ColliderBit;CosmoBit;DarkBit;NeutrinoBit;ObjectivesBit;FlavBit;DecayBit;SpecBit;PrecisionBit;Mathematica" \ - .. -``` - -Notes: -- `PYTHON_EXECUTABLE`/`PYTHON_INCLUDE_DIR`/`PYTHON_LIBRARY` make cmake use - the same Python (3.11 here) for both build-time scripts and the runtime - libpython — eliminating the dual pip install. Confirm by looking for - matching versions in the cmake output: - ``` - Using Python interpreter version 3.11.15 for build. - Using Python libraries version 3.11.15 for Python backend support. - ``` - If you see a `NOTE: You are using different Python versions for the - interpreter and the libraries!` message, the flags weren't picked up - (likely because `CMakeCache.txt` from a previous run is sticky — wipe - the build dir and rerun). -- `itch` lists modules to **exclude** from the build. - -- `WITH_MPI=False` and `WITH_ROOT=False` keep the build minimal. Set - `-DWITH_MPI=True` instead if you installed OpenMPI per the optional - step in section 1; cmake will then pick up `mpic++` automatically and - compile the `#ifdef WITH_MPI` branches in the printer / scanner code. - -- This first cmake call takes ~75 seconds (it clones pybind11, runs many - feature checks). - -## 4. Build scanners, then re-cmake, then build gambit - -``` -make diver -make multinest -cmake .. # re-detect newly built scanner libraries -make gambit -j$(nproc) -``` - -Expected timings on 16 cores: -- `diver`: ~1 min (mostly download) -- `multinest`: ~1 min (mostly download) -- `cmake ..`: ~75 s -- `make gambit -j16`: Can take ~25 min (gambit.cpp.o alone can take ~20 min) - -The whole chain runs cleanly start-to-finish if build is done in the order above. - -### Long-running build tip (Claude Code) - -The `Bash` tool defaults to a 10-minute timeout, but exceeding it does -**not** kill the command — the tool returns "running in background" and -the build continues. You can poll with subsequent `ps`/`tail` calls. Or -raise the cap explicitly via the `BASH_MAX_TIMEOUT_MS` env var in -`~/.claude/settings.json` (e.g. set to `3600000` for 60 min): - -```json -{ - "env": { "BASH_MAX_TIMEOUT_MS": "3600000" } -} -``` - -To watch a verbose log instead of the silent default, prepend `VERBOSE=1`: - -``` -make gambit -j$(nproc) VERBOSE=1 -``` - -## 5. Smoke test - -``` -./gambit --version -./gambit -h -./gambit -d -f yaml_files/spartan.yaml # dry-run, prints functor tree -``` - -A successful dry-run prints the resolved dependency tree. - -## 6. Run example - -Without MPI: - -``` -OMP_NUM_THREADS=1 ./gambit -rf yaml_files/spartan.yaml -``` - -With MPI (build configured with `-DWITH_MPI=True`): - -``` -OMP_NUM_THREADS=1 mpiexec -np 2 ./gambit -rf yaml_files/spartan.yaml -``` - -If `mpiexec` complains about running as root in a container, add -`--allow-run-as-root` (and, if you hit "not enough slots", also -`--oversubscribe`): - -``` -OMP_NUM_THREADS=1 mpiexec --allow-run-as-root --oversubscribe -np 2 \ - ./gambit -rf yaml_files/spartan.yaml -``` - diff --git a/hdf5_printer_issues.md b/hdf5_printer_issues.md deleted file mode 100644 index 77a2da70f0..0000000000 --- a/hdf5_printer_issues.md +++ /dev/null @@ -1,195 +0,0 @@ -# HDF5 printer audit — synthesized findings - -Audit of the v1 (`Printers/src/printers/hdf5printer/`) and v2 -(`Printers/src/printers/hdf5printer_v2/`) HDF5 printer implementations, -performed after the file_id fix for issue #495. Findings sorted by -severity and grouped where v1 and v2 share a class of bug. - -## Critical (likely to bite real runs) - -### [v2] `final_size` is uninitialised on workers in `HDF5Printer2::finalise()` -`hdf5printer_v2.cpp:1958-2010`. `final_size` is set only inside -`if(myRank==0)` at lines 1959-1967, but the loop at 1996-2010 runs on -**every** rank and calls `(*it)->extend_all_datasets_to(final_size)` at -line 2008. Workers pass garbage. There's no rank-0 guard around the -loop. This will misbehave any time a worker has a non-synchronised aux -buffer registered (which any scanner using RA aux streams over MPI -produces). The whole 1996-2010 block belongs inside `if(myRank==0)`, -alongside the matching block at 1930-1952. - -### [v1] `errorsOff()` / `errorsOn()` nesting permanently silences HDF5 errors -`hdf5tools.cpp:369-386`. `errorsOff()` saves the old handler into -module-globals `old_func` / `old_client_data`. The function at line 377 -calls `errorsOff()` again *while errors are already off* — so it saves -NULL into those globals. The matching `errorsOn()` at line 386 then -restores NULL, leaving HDF5 error reporting permanently disabled for the -rest of the process. Same pattern at lines 395/408 (less reachable). Any -subsequent HDF5 problem will fail silently. - -### [v1] Aux printers' uninitialised `hid_t` members beyond `file_id` -`hdf5printer.hpp:336-345`. None of `file_id`, `group_id`, `RA_group_id`, -`metadata_id` (and arguably the `*_location_id` set) have default -initialisers. The aux constructor only assigns `location_id` / -`RA_location_id` / `metadata_location_id`. The `H5Fflush` issue we just -fixed (#495) is one example — others are latent. Suggests a wider fix: -either default-init all to `-1` (HDF5's invalid handle), or always go -through `primary_printer->X` in code that runs on aux instances. - -### [v1] `openGroup` returns after creating only the first non-existent path component -`hdf5tools.cpp:540-574`. On the first iteration where `H5Gopen2` fails -and `H5Gcreate2` succeeds, the inner `else` branch at line 564 returns -immediately, exiting the loop before processing the remaining -components. Latent because v1 only ever opens single-level paths -(`/data`, `/data/RA`, etc. as separate calls), but the function would -silently mis-create any user-supplied deeper `group:` option. - -### [v1] Buffer destructors do not close their HDF5 datasets -`VertexBufferNumeric1D_HDF5.hpp:294-303`, -`DataSetInterfaceBase.hpp:189-204`. Destructors are deliberately empty -(close calls commented out, with TODO about copy semantics). Cleanup -relies entirely on `finalise()` running. Any abnormal exit before -finalise — a constructor exception elsewhere, an error during -synchronisation, an uncaught signal — leaks dataset handles and likely -loses unflushed data when `H5Fclose` auto-closes them later. - -### [v2] `H5Tclose` missing in `HDF5DataSet::write_buffer` — leaks one type handle per flush -`hdf5printer_v2.hpp:304-311`. `hid_t dtype = H5Dget_type(get_dset_id())` -is opened but never closed. The companion `write_RA_buffer` at lines -438/467 *does* close it. Long scans with many datasets and many flushes -accumulate handles until HDF5's table or the OS file-handle limit gives. - -### [v2] Stack-local `RAbuffer` added to member `aux_buffers`, then destroyed -`hdf5printer_v2.cpp:1973, 1994`. `HDF5MasterBuffer RAbuffer(...)` is a -local; `add_aux_buffer(RAbuffer)` pushes its address into the -`aux_buffers` vector. When `finalise` returns, the pointer dangles. -Currently dormant (no second `finalise` call, empty destructor at 1844) -but a bug-magnet for any future change. - -## Major - -### [v1] `masterWaitForAll(FINAL_SYNC)` deadlocks if any worker dies before reaching it -`hdf5printer.cpp:987` plus `Utils/src/mpiwrapper.cpp:163-188`. The -barrier is plain blocking `Recv`s on rank 0, no Iprobe-with-timeout -(unlike `allWaitForMasterWithFunc`). The TODO comment at line 988 -acknowledges this. Same risk on the resume-mode `myComm.Barrier()` at -line 610 if `prepare_and_combine_tmp_files` raises on rank 0. - -### [v1] `exit(1)` from inside the constructor after `closeFile` -`hdf5printer.cpp:467-469`. On any restart-without-`-r` against an -existing output file with the requested group missing — a normal user -mistake — rank 0 calls `exit(1)`, bypassing MPI shutdown. Workers -blocked on the upcoming `myComm.Barrier()` at line 610 hang forever. - -### [v2] `MPI_flush_to_rank` / `MPI_recv_all_buffers` / `MPI_request_buffer_data` are dead code -`hdf5printer_v2.cpp:664-810`, `hdf5printer_v2.hpp:910-1001`, including -`recv_counter` / `send_counter` debug scaffolding. No callers. -`finalise()` uses the `gather_and_print` / `Gatherv` path instead. -Either half-finished refactor leftover or a missed wiring that should be -using this faster path. Recommend either deleting or wiring up — the -deadlock-warning comment is currently misleading. - -### [v2] `HDF5Printer2::flush()` does not gather over MPI; only the calling rank's buffermaster is touched -`hdf5printer_v2.cpp:1876-1879`. So between scanner-driven `flush()` -calls, RA aux data on workers stays in worker memory until `finalise()`. -Different from v1's effective semantics. Any scanner that calls -`aux_stream->flush()` for durability (MultiNest's dumper at -`multinest_3.12/multinest.cpp:362-363` is exactly this) will *not* get -RA data from non-rank-0 ranks to disk until the scan ends. - -### [v1] `synchronise_buffers` underflows if `sync_pos == 0` -`hdf5printer.cpp:1307`. `const unsigned long sync_pos = get_sync_pos()-1;` -becomes `ULONG_MAX`. Currently never reached because `check_for_new_point` -only triggers it after a real point has been seen, but the invariant is -not enforced anywhere. - -### [v1] Unbounded growth of `postpone_write_queue_and_locs` -`VertexBufferNumeric1D_HDF5.hpp:63, 541-544`. RA writes that can't yet -be matched to a sync point get pushed without a cap. The MPI-based -queue-sending that was supposed to bound this has been disabled -(commented out) and the comments weren't updated. - -### [v1] `_print_metadata` always passes `resume=false` -`hdf5printer.cpp:1681-1691`. Re-creates datasets on every call. -Currently only called once, at `finalise()`, so dormant — but any new -caller will trip the "Dataset with same name may already exist" path -immediately. - -### [v1] Aux constructor doesn't validate `dynamic_cast` result -`hdf5printer.cpp:683`. If the cast fails (HDF5 aux attached to non-HDF5 -primary), `primary_printer` becomes nullptr and the very next -`set_resume(primary_printer->get_resume())` segfaults. - -### [v2] `print_metadata` underflow -`hdf5printer_v2.cpp:609-616`. `std::size_t size = get_next_metadata_position(); if(sameset) size --;` -— if the GAMBIT dataset doesn't exist yet, `get_next_metadata_position()` -returns 0, `--` underflows to `SIZE_MAX`. Currently protected by the -call-site `if (use_metadata)` at 1948, but the function is unsafe. - -### [v1] Iterators decremented past `begin()` in combine size-trim loops -`hdf5_combine_tools.cpp:539-545`, `807-813`. Technically UB on -`std::vector::iterator` when all entries are false. Works on most STLs. - -### [v1] `H5Dget_type` not closed in combine tools -`hdf5_combine_tools.cpp:949-950, 955-956, 1144-1145, 1273, 1278`. -Type-handle leak per file/parameter combined; contributes to "too many -open objects" warnings on big merges. - -## Minor / Notes - -A handful more across both files: - -- v1 dead/stale state members (`current_dset_position`, `previous_points`, - `global=false` field never assigned). -- v1 `errorsOff` / `errorsOn` use static globals → fundamentally not - thread-safe. -- v1 hardcoded `BUFFERLENGTH=100` and `MAX_PPIDPAIRS=1000`. -- v1 `combine_output_py` is dead code (uses a Python helper that may not - exist). -- v2 constructor's resource-leak risk if `printer_error()` raises - mid-setup (no RAII). -- v2 generous `std::cout` / `std::cerr` use where `logger()` would be - appropriate (e.g. lines 1514, 1539, 1543, 1705, 1713, 1716, 1745, - 1762, 1965, 2018). -- v2 narrowing conversions between `std::size_t` and `int` in MPI size - handling. -- Both v1 and v2 shell out to `popen("rm ...")` and `popen("cp ...")` - rather than calling `std::remove` / filesystem APIs. Brittle on paths - with shell metacharacters. -- v2 `MAX_BUFFER_SIZE = 100000` `T buffer[MAX_BUFFER_SIZE]` is a stack - array (~800 KB for `T=double`). Fine on Linux defaults, fragile - elsewhere. -- v2 `H5Tequal(in-memory, on-disk)` check can fire spuriously on resume - because on-disk types aren't normalised through `H5Tget_native_type` - like v1 does. -- v2 `_isvalid` companion existence not separately tracked from value - dataset (TODO at `hdf5printer_v2.hpp:888`). - -## What to prioritize fixing - -1. **v2 `final_size` / `extend_all_datasets_to` on workers** — same shape - as the bug we just fixed: a finalise-time member-not-set-on-aux-path - issue that hides because some scenarios mask the UB. Will hit any MPI - scan with v2 + RA aux streams. -2. **v1 `errorsOff` / `errorsOn` nesting** — silently disables your - safety net for HDF5 errors. Trivial to fix (save state in a local). -3. **v1 uninit `hid_t` cluster** — same root cause as #495. A small - refactor to default-init or to collapse aux/primary handles into a - single struct would prevent another year of "another field wasn't set - on aux printers" bugs. -4. **v2 `H5Tclose` leak in `write_buffer`** — slow-burn but unbounded. -5. **v1 buffer-destructor cleanup** — would matter on any abnormal - shutdown, including signal-driven ones from a scheduler killing a - job. - -Everything below "Major" is good to file as low-priority issues but -probably not urgent. - -## Theme - -A lot of the fragility across both printers comes from sharing state -between primary and auxiliary printer instances via raw pointers and -ad-hoc "use this one, not that one" rules. A refactor that makes the -auxiliary's role purely a *view* over a shared `HDF5File` object (so -there are no per-printer file/group handles to forget to initialise) -would eliminate this whole class of bug. Big change, but probably worth -scoping if you're touching this code anyway. diff --git a/yaml_files/spartan_issue_495.yaml b/yaml_files/spartan_issue_495.yaml deleted file mode 100644 index b87d435cb7..0000000000 --- a/yaml_files/spartan_issue_495.yaml +++ /dev/null @@ -1,456 +0,0 @@ -########################################################################## -## GAMBIT configuration for an ultra-minimal scan of a toy model. -## -## Only needs ExampleBit_A and a scanner (either one of the built-ins like -## the random sampler or TWalk, or an external like Diver, MultiNest, -## GreAT or Polychord). -########################################################################## - - -Parameters: - # In this example we will simply be fitting the mean and standard deviation of a normal distribution. - NormalDist: - # Example of using simple priors: - mu: - prior_type: flat - range: [15, 30] - sigma: - prior_type: flat - range: [0, 5] - - # If using a more complicated prior specified in the 'Priors' section below, - # use the 'in_priors' keyword in this section: - # mu: in_priors - # sigma: in_priors - - -Priors: - - # None needed: simple priors are already specified in the 'Parameters' section above. - - # If defining a more complicated prior, make sure to tag the relevant parameters with the - # keyword 'in_priors' in the 'Parameters' section above. Below is an example of a - # 2D gaussian prior for the two parameters 'mu' and 'sigma': - # - # my_prior: - # parameters: ["NormalDist::mu", "NormalDist::sigma"] - # prior_type: gaussian - # mu: [18, 5] - # cov_matrix: [[9.0, 0.0], [0.0, 1.0]] - # # sigma: [3.0, 1] - - -Printer: - - # printer: sqlite - # options: - # output_file: "results.sql" - # table_name: "data" - # buffer_length: 1000 - # delete_file_on_restart: true - - # printer: hdf5 - # options: - # output_file: "results.hdf5" - # group: "/data" - # delete_file_on_restart: true - # buffer_length: 1000 - # # disable_autorepair: true - - printer: hdf5_v1 - options: - output_file: "gambit_output.hdf5" - group: "/data" - delete_file_on_restart: true - disable_combine_routines: true - - # printer: ascii - # options: - # output_file: "results.dat" - # buffer_length: 10 - # delete_file_on_restart: true - # print_debug_data: true - - # printer: none - - -Scanner: - - use_scanner: multinest - - scanners: - - random: - plugin: random - like: LogLike - point_number: 1000 - - grid: - # plugin: pygrid - plugin: grid - #version: ">=1.0" - like: LogLike - grid_pts: [5, 3] - - de: - plugin: diver - like: LogLike - NP: 500 - initial_guesses: - parameters: [NormalDist::sigma, NormalDist::mu] - values: - - [2.5, 25] - - [2, 24] - - [1.5, 22] - - multinest: - plugin: multinest - like: LogLike - nlive: 500 - #tol: 0.0001 - tol: 0.1 - - mcmc: - plugin: great - like: LogLike - nTrialLists: 5 - nTrials: 10000 - - twalk: - plugin: twalk - like: LogLike - sqrtR: 1.001 - - polychord: - plugin: polychord - like: LogLike - print_parameters_in_native_output: true - tol: 0.1 - - pso: - plugin: jswarm - like: LogLike - NP: 400 - adaptive_phi: true - adaptive_omega: true - verbosity: 2 - - minuit2: - plugin: minuit2 - like: LogLike - tolerance: 0.0001 - precision: 0.0001 - max_loglike_calls: 100000 - max_iterations: 100000 - algorithm: combined # simplex, combined, scan, fumili, bfgs, migrad - print_level: 1 - strategy: 2 - start: - NormalDist::mu: 25. - NormalDist::sigma: 2.5 - step: - NormalDist::mu: 5. - NormalDist::sigma: 1. - - scipy_basin_hopping: - like: LogLike - plugin: scipy_basin_hopping - # The run arguments below have been tested with scipy v1.9. - # Other versions might expect different arguments. - run: - # n_runs: 4 - x0: - NormalDist::mu: 25. - NormalDist::sigma: 2.5 - niter: 100 - T: 1.0 - stepsize: 0.5 - minimizer_kwargs: - method: "L-BFGS-B" - interval: 50 - disp: true - target_accept_rate: 0.5 - stepwise_factor: 0.9 - - scipy_differential_evolution: - like: LogLike - plugin: scipy_differential_evolution - # The run arguments below have been tested with scipy v1.9. - # Other versions might expect different arguments. - run: - # n_runs: 4 - strategy: 'best1bin' - maxiter: 1000 - popsize: 15 - tol: 0.01 - mutation: [0.5, 1] - recombination: 0.7 - disp: false - polish: true - init: 'latinhypercube' - atol: 0 - updating: 'immediate' - x0: - NormalDist::mu: 25. - NormalDist::sigma: 2.5 - - scipy_direct: - like: LogLike - plugin: scipy_direct - # The run arguments below have been tested with scipy v1.9. - # Other versions might expect different arguments. - run: - # n_runs: 4 - eps: 0.0001 - # maxfun: 2000 - maxiter: 1000 - locally_biased: true - vol_tol: 1e-16 - len_tol: 1e-6 - - scipy_dual_annealing: - like: LogLike - plugin: scipy_dual_annealing - # The run arguments below have been tested with scipy v1.9. - # Other versions might expect different arguments. - run: - # n_runs: 4 - maxiter: 1000 - initial_temp: 5230.0 - restart_temp_ratio: 2.0e-5 - visit: 2.62 - accept: -5.0 - maxfun: 10000000.0 - no_local_search: false - x0: - NormalDist::mu: 25. - NormalDist::sigma: 2.5 - - scipy_shgo: - like: LogLike - plugin: scipy_shgo - # The run arguments below have been tested with scipy v1.9. - # Other versions might expect different arguments. - run: - split_param_space: - NormalDist::mu: 2 - NormalDist::sigma: 2 - n: 100 - iters: 1 - sampling_method: "sobol" # "simplicial", "halton", "sobol" - minimizer_kwargs: - method: "SLSQP" # "SLSQP" "L-BFGS-B" - options: - ftol: 1e-12 - options: - # maxfev: - # f_min: - # f_tol: - # maxiter: - # maxev: - # maxtime: - # minhgrd: - # jac: false # Buggy in some scipy versions: https://github.com/scipy/scipy/issues/14533 - minimize_every_iter: true - local_iter: false - infty_constraints: true - disp: false - - scipy_minimize: - like: LogLike - plugin: scipy_minimize - # The run arguments below have been tested with scipy v1.9. - # Other versions might expect different arguments. - run: - # n_runs: 5 - x0: - NormalDist::mu: 25. - NormalDist::sigma: 2.5 - method: "L-BFGS-B" # "Nelder-Mead", "Powell", "CG", "BFGS", "L-BFGS-B", "TNC", "COBYLA", "SLSQP", "trust-constr", - # jac: "2-point" # "2-point", "3-point", "cs" - # hess: "2-point" # "2-point", "3-point", "cs" - tol: 1e-6 - options: - maxiter: 15000 - disp: false - - static_dynesty: - like: LogLike - pkg: gambit_dynesty - plugin: static_dynesty - pkl_name: "static_dynesty.pkl" - init: - nlive: 1000 - run: - dlogz: 0.5 - checkpoint_every: 60 - - dynamic_dynesty: - like: LogLike - plugin: dynamic_dynesty - pkg: gambit_dynesty - pkl_name: "dynamic_dynesty.pkl" - init: - nlive: 1000 - run: - dlogz_init: 0.5 - n_effective: 10000 - checkpoint_every: 60 - - nessai: - like: LogLike - plugin: nessai_flow_sampler - pkg: gambit_nessai - #init: - #output: "nessai_log_dir" - #logger: true - - nautilus: - like: LogLike - plugin: nautilus - #pkg: gambit_nautilus - run: - verbose: true - - zeus: - like: LogLike - pkg: gambit_zeus - plugin: zeus - init: - nwalkers: 8 - #run: - #nsteps: 20000 - #filename: "zeus.h5" - SaveProgressCallback: - filename: zeus.h5 - ncheck: 100 - - emcee: - like: LogLike - plugin: emcee - pkg: gambit_emcee - init: - nwalkers: 8 - run: - nsteps: 20000 - #filename: "emcee.h5" - - pocomc: - like: LogLike - plugin: pocomc - #pkg: gambit_pocomc - #init: - # nparticles: 1000 - - ultranest: - like: LogLike - pkg: gambit_ultranest - plugin: reactive_ultranest - #init: - run: - min_num_live_points: 1000 - dlogz: 0.5 - - binminpy: - like: LogLike - plugin: binminpy - run: - n_bins: - # For any parameters not listed under 'n_bins' binminpy will assume a single bin - NormalDist::mu: 100 - NormalDist::sigma: 100 - # sampled_parameters: ["NormalDist::mu", "NormalDist::sigma"] # By default parameters are optimized within each bin - sampler: "latinhypercube" - optimizer: "minimize" - optimizer_kwargs: - method: "L-BFGS-B" - tol: 1e-4 - n_initial_points: 10 - n_sampler_points_per_bin: 10 - # accept_loglike_above: -30.0 - accept_delta_loglike: 6.5 - neighborhood_distance: 2 - inherit_best_init_point_within_bin: False - n_optim_restarts_per_bin: 1 - n_tasks_per_batch: 1 - print_progress_every_n_batch: 100 - - -ObsLikes: - - - purpose: LogLike - capability: normaldist_loglike - module: ExampleBit_A - type: double - - #- purpose: Observable - # capability: normaldist_loglike - # module: ExampleBit_A - # type: double - # critical: true - -Rules: - - - if: - capability: normaldist_loglike - then: - options: - probability_of_validity: 1.0 - -Logger: - - redirection: - [Default] : "default.log" - [ExampleBit_A] : "ExampleBit_A.log" - [Scanner] : "Scanner.log" - debug: true - -KeyValues: - - debug: false #true - - default_output_path: "${PWD}/runs/spartan" - - # An additional entry in the dataset and metadata, useful for identifying which - # points correspond to a given scan in combined datasets. - # The default is for print_scanID: true, - # and for it to print the date and time as an int of the form - # scanID: HourMinuteSecondMillisecond. This can be overwritten to any integer. - print_scanID: true - scanID: 1 - - rng: - generator: ranlux48 - seed: -1 - - print_timing_data: true - - print_unitcube: true - - likelihood: - model_invalid_for_lnlike_below: -1e6 - - # A 'likelihood modifier function' recieves as input the total - # log-likelihood value and outputs a modified log-likelihood which - # is then passed to the scanner. This can be used to make an adaptive - # scanner explore specific ranges of the total log-likelihood, e.g. - # log-likelihood values corresponding to a given 1D/2D confidence region. - # The default is to use the 'identity' modifier, which does nothing. - use_lnlike_modifier: identity - lnlike_modifiers: - # Assuming that the best-fit log-likelihood value is 0.0, - # the 'gaussian' or 'gaussian_plateau' settings below - # will encourage the scanner to explore parameter regions - # at the border of the 2-sigma confidence region in 2D - # (Delta lnlike = -3.09). - gaussian: - mu: -3.1 - sigma: 0.5 - # use_limit: lower - use_delta_lnlike: false - gaussian_plateau: - mu_dn: -3.2 - sigma_dn: 0.5 - mu_up: -3.0 - sigma_up: 3.0 - use_delta_lnlike: false From 07212109d08f1130ebba92b2a77e0b90c554a58f Mon Sep 17 00:00:00 2001 From: Anders Kvellestad Date: Tue, 5 May 2026 09:46:39 +0200 Subject: [PATCH 20/24] Clarified a code snippet and comment in hdf5printer.hpp --- .../gambit/Printers/printers/hdf5printer.hpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Printers/include/gambit/Printers/printers/hdf5printer.hpp b/Printers/include/gambit/Printers/printers/hdf5printer.hpp index c6a8acb1bc..d573469aa0 100644 --- a/Printers/include/gambit/Printers/printers/hdf5printer.hpp +++ b/Printers/include/gambit/Printers/printers/hdf5printer.hpp @@ -552,13 +552,14 @@ namespace Gambit // Force increment the buffer to "catch it up" to the current sync // position, in case it has been created "late". - // We subtract one because another increment will happen after - // the print statement (that triggered the creation of the new - // buffer) completes. - // If sync_pos is still 0 there is no previous position to catch up - // to (and the buffer is already at dset_head_pos == 0 from - // construction), so skip the call to avoid an unsigned underflow. - if(synchronised and printer->get_sync_pos()>0) it->second.fast_forward(printer->get_sync_pos()-1); + if (printer->get_sync_pos() > 0) // if 0, nothing to catch up to + { + // We subtract one because another increment will happen after + // the print statement (that triggered the creation of the new + // buffer) completes. (Subtraction is safe since get_sync_pos() > 0. + // Otherwise this would have lead to an underflow. + if(synchronised) it->second.fast_forward(printer->get_sync_pos()-1); + } } if( it == local_buffers.end() ) From f13198798706be81763e8edbd1400dee9fd1e003 Mon Sep 17 00:00:00 2001 From: Anders Kvellestad Date: Tue, 5 May 2026 09:48:40 +0200 Subject: [PATCH 21/24] Fixed typos in hdf5printer.hpp --- Printers/include/gambit/Printers/printers/hdf5printer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Printers/include/gambit/Printers/printers/hdf5printer.hpp b/Printers/include/gambit/Printers/printers/hdf5printer.hpp index d573469aa0..a37c56d72e 100644 --- a/Printers/include/gambit/Printers/printers/hdf5printer.hpp +++ b/Printers/include/gambit/Printers/printers/hdf5printer.hpp @@ -557,7 +557,7 @@ namespace Gambit // We subtract one because another increment will happen after // the print statement (that triggered the creation of the new // buffer) completes. (Subtraction is safe since get_sync_pos() > 0. - // Otherwise this would have lead to an underflow. + // Otherwise this would have led to an underflow.) if(synchronised) it->second.fast_forward(printer->get_sync_pos()-1); } } From 0ba474e600c78aff8cc895de24bfa7d497cf202e Mon Sep 17 00:00:00 2001 From: Anders Kvellestad Date: Tue, 5 May 2026 09:51:15 +0200 Subject: [PATCH 22/24] Tweaked comment in hdf5printer_v2.hpp --- Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp b/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp index a6f996d1cc..07f04edaaa 100644 --- a/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp +++ b/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp @@ -294,8 +294,7 @@ namespace Gambit } // dtype is no longer needed after the equality check; close it // here to avoid leaking one HDF5 type handle per flush per - // dataset (write_RA_buffer below already does this at its - // matching H5Dget_type site). + // dataset. H5Tclose(dtype); std::size_t required_size = target_pos+length; From d1edc9c3b52cb3239071654135d5d79805cec3e7 Mon Sep 17 00:00:00 2001 From: Anders Kvellestad Date: Tue, 5 May 2026 09:54:56 +0200 Subject: [PATCH 23/24] Tweaks to comment in hdf5tools.cpp --- Printers/src/printers/hdf5printer/hdf5tools.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Printers/src/printers/hdf5printer/hdf5tools.cpp b/Printers/src/printers/hdf5printer/hdf5tools.cpp index f47200a3ae..be934013c8 100644 --- a/Printers/src/printers/hdf5printer/hdf5tools.cpp +++ b/Printers/src/printers/hdf5printer/hdf5tools.cpp @@ -668,14 +668,14 @@ namespace Gambit { /// Close hdf5 group SIMPLE_CALL(hid_t, closeGroup, hid_t, H5Gclose, "close", "group", "group") - /// State for errorsOff() / errorsOn(). File-local: only the two - /// functions below should ever touch these. The bool guard ensures + /// State for errorsOff() / errorsOn(). The bool guard ensures /// that a redundant errorsOff() called while errors are already off /// does NOT overwrite the saved handler with the NULL handler we /// installed ourselves -- otherwise the matching errorsOn() would /// "restore" NULL and silently disable HDF5 error reporting for /// the rest of the process. - namespace { + namespace + { bool errors_are_off = false; H5E_auto2_t saved_func = nullptr; void *saved_client_data = nullptr; From 02d482af6b8408cb71156449a4893660b7d91f05 Mon Sep 17 00:00:00 2001 From: Anders Kvellestad Date: Thu, 7 May 2026 19:01:09 +0200 Subject: [PATCH 24/24] Revised a bunch of overly verbose code comments. --- .../gambit/Printers/printers/hdf5printer.hpp | 15 ++++----------- .../hdf5printer/DataSetInterfaceBase.hpp | 19 ++++++------------- .../Printers/printers/hdf5printer_v2.hpp | 11 +++-------- .../src/printers/hdf5printer/hdf5printer.cpp | 15 +++++---------- .../src/printers/hdf5printer/hdf5tools.cpp | 9 +++------ 5 files changed, 21 insertions(+), 48 deletions(-) diff --git a/Printers/include/gambit/Printers/printers/hdf5printer.hpp b/Printers/include/gambit/Printers/printers/hdf5printer.hpp index a37c56d72e..613b2761ea 100644 --- a/Printers/include/gambit/Printers/printers/hdf5printer.hpp +++ b/Printers/include/gambit/Printers/printers/hdf5printer.hpp @@ -333,15 +333,9 @@ namespace Gambit std::string metadata_group; // HDF5 group location to store metadata // Handles for HDF5 files and groups containing the datasets. - // Default-initialised to -1 (the convention this class already uses - // as the "unset HDF5 handle" sentinel; see e.g. the get_location() / - // get_RA_location() / get_metadata_location() checks). Only the - // primary printer's constructor assigns the file/group handles - // (file_id, group_id, RA_group_id, metadata_id); aux instances - // inherit the *_location_id values from the primary. The default - // initialisers ensure that any code path that mistakenly reads one - // of these on an aux instance gets a deterministic invalid handle - // rather than indeterminate memory. + // Only the primary printer's constructor assigns the file/group + // handles (file_id, group_id, RA_group_id, metadata_id). Aux instances + // inherit the *_location_id values from the primary. hid_t file_id = -1; hid_t group_id = -1; hid_t RA_group_id = -1; @@ -556,8 +550,7 @@ namespace Gambit { // We subtract one because another increment will happen after // the print statement (that triggered the creation of the new - // buffer) completes. (Subtraction is safe since get_sync_pos() > 0. - // Otherwise this would have led to an underflow.) + // buffer) completes. if(synchronised) it->second.fast_forward(printer->get_sync_pos()-1); } } diff --git a/Printers/include/gambit/Printers/printers/hdf5printer/DataSetInterfaceBase.hpp b/Printers/include/gambit/Printers/printers/hdf5printer/DataSetInterfaceBase.hpp index e5b0a8233b..8e55d47338 100644 --- a/Printers/include/gambit/Printers/printers/hdf5printer/DataSetInterfaceBase.hpp +++ b/Printers/include/gambit/Printers/printers/hdf5printer/DataSetInterfaceBase.hpp @@ -188,20 +188,13 @@ namespace Gambit { /// Deliberately does NOT close the underlying HDF5 dataset. /// /// DataSetInterfaceBase (and its derived classes) are stored by value - /// and assigned-from-temporary in several places, e.g. - /// _dsetdata = DataSetInterfaceScalar(loc, name, ...); - /// local_buffers[key] = BuffType(loc, label, vertexID, ...); - /// In each case a temporary is constructed, then copy-/move-assigned - /// into a member or container slot, then destroyed. With the default - /// copy semantics both objects hold the same hid_t dset_id, so - /// closing in the destructor would invalidate the handle held by the - /// surviving copy. The actual close therefore lives in the explicit - /// closeDataSet() method below, which is driven by + /// and assigned-from-temporary in several places. In such cases a + /// temporary is constructed, then copy-/move-assigned into a member or + /// container slot, then destroyed. Closing here in the destructor would + /// invalidate the handle held by the surviving copy. The actual close + /// therefore lives in the explicit closeDataSet() method below, used by /// VertexBufferNumeric1D_HDF5::finalise() during the printer's - /// finalise() pass. If finalise() is bypassed (e.g. an exception - /// during scan setup or shutdown), HDF5's own H5Fclose() at process - /// exit will auto-close any still-open dataset handles and flush - /// their pending I/O, so the file on disk remains well-formed. + /// finalise() pass. template DataSetInterfaceBase::~DataSetInterfaceBase() { diff --git a/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp b/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp index 07f04edaaa..14d2be3d61 100644 --- a/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp +++ b/Printers/include/gambit/Printers/printers/hdf5printer_v2.hpp @@ -78,13 +78,9 @@ namespace Gambit /// Largest allowed size of buffers. Size can be dynamically set from 1 to this number. static const std::size_t MAX_BUFFER_SIZE = 100000; - // The integer-encoded type ID for buffer-data messages - // (formerly used by h5v2_bufdata_type / h5v2_bufdata_values; the - // type encoding itself is still in use by the gather_and_print - // path via HDF5bufferchunk and add_to_buffers). + // The integer-encoded type ID for buffer-data messages. // Need a unique integer for each type. We can encode these // with a template function: - template std::set set_diff(const std::set& set1, const std::set& set2) { @@ -292,9 +288,8 @@ namespace Gambit errmsg << "Error! Tried to write to dataset (name="<file_id, H5F_SCOPE_GLOBAL); if(err<0) { diff --git a/Printers/src/printers/hdf5printer/hdf5tools.cpp b/Printers/src/printers/hdf5printer/hdf5tools.cpp index be934013c8..6f73fd96dd 100644 --- a/Printers/src/printers/hdf5printer/hdf5tools.cpp +++ b/Printers/src/printers/hdf5printer/hdf5tools.cpp @@ -668,12 +668,9 @@ namespace Gambit { /// Close hdf5 group SIMPLE_CALL(hid_t, closeGroup, hid_t, H5Gclose, "close", "group", "group") - /// State for errorsOff() / errorsOn(). The bool guard ensures - /// that a redundant errorsOff() called while errors are already off - /// does NOT overwrite the saved handler with the NULL handler we - /// installed ourselves -- otherwise the matching errorsOn() would - /// "restore" NULL and silently disable HDF5 error reporting for - /// the rest of the process. + /// State for errorsOff() / errorsOn(). The bool guard errors_are_off + /// ensures that a redundant errorsOff() called while errors are already + /// off does NOT overwrite the saved handler with the NULL handler. namespace { bool errors_are_off = false;