From ced3b8c6cca8d04f97bdcb8bdd143373fb79eaa1 Mon Sep 17 00:00:00 2001 From: Timo Sachsenberg Date: Wed, 15 Apr 2026 11:36:29 +0200 Subject: [PATCH 01/20] feat: allow Bruker DIA MS2 centroiding without denoising (#9151) Lower the CLI floor on -bruker:dia_ms2_min_support from 1 to 0 so users can run frame-aggregated MS2 centroiding without the 3x3 neighbor filter. With centroid=true and min_support=0, every populated grid cell becomes a local-maxima candidate instead of being dropped when isolated. Also short-circuit the denoise step in DIAFrameAggregator when min_support<=0 to skip the now-useless 8-probe hashmap lookup per cell. Regression guard added: with the test fixture, min_support=0 retains ~6.7x more centroids (106.5M vs 15.9M peaks) than min_support=1, with spectra still emitted as IM_CENTROIDED. Co-authored-by: Claude Opus 4.6 (1M context) --- src/openms/source/FORMAT/BrukerTimsFile.cpp | 4 +- .../openms/source/BrukerTimsFile_test.cpp | 61 +++++++++++++++++++ src/topp/FileConverter.cpp | 5 +- 3 files changed, 66 insertions(+), 4 deletions(-) diff --git a/src/openms/source/FORMAT/BrukerTimsFile.cpp b/src/openms/source/FORMAT/BrukerTimsFile.cpp index ede0e2a84f6..ada78a34813 100644 --- a/src/openms/source/FORMAT/BrukerTimsFile.cpp +++ b/src/openms/source/FORMAT/BrukerTimsFile.cpp @@ -1860,14 +1860,14 @@ namespace OpenMS } } - // Denoise (skip if only 1 frame in range) + // Denoise (skip if only 1 frame in range, or caller disabled it via min_support <= 0) // TODO: This checks the index range, not the actual number of neighbor // frames that contributed peaks to the grid. If neighbor frames exist // but have zero peaks passing the IM filter for this window, the grid // contains only single-frame data yet denoising still runs — which may // remove valid isolated peaks. Consider counting actual contributing // frames if this becomes an issue in practice. - bool skip_denoise = (hi - lo) < 1; + bool skip_denoise = (hi - lo) < 1 || config.dia_ms2_min_support <= 0; auto peaks = config.dia_ms2_centroid ? aggregator.finalizeCentroided(config.dia_ms2_min_support, skip_denoise) : aggregator.finalize(config.dia_ms2_min_support, skip_denoise); diff --git a/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp b/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp index 3f5592d819d..285ea7fdf85 100644 --- a/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp +++ b/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp @@ -736,6 +736,67 @@ START_SECTION(DIA MS2 centroiding test) } END_SECTION +START_SECTION(DIA MS2 centroiding without denoising (min_support=0)) +{ + // Regression guard for the "centroid without noise filter" path exposed via + // Config::dia_ms2_min_support = 0. With min_support=0 the 3x3 neighbor filter + // in DIAFrameAggregator::finalize{,Centroided} is a no-op (`neighbors < 0` is + // always false), so every populated grid cell survives the denoise step. + // Under centroiding that means more local maxima → strictly more output peaks + // than the denoised (min_support=1) path, because denoising preferentially + // removes isolated cells that would otherwise become their own centroid. + BrukerTimsFile f; + + BrukerTimsFile::Config cfg_denoise; + cfg_denoise.dia_ms2_n_neighbors = 1; + cfg_denoise.dia_ms2_min_support = 1; + cfg_denoise.dia_ms2_centroid = true; + MSExperiment exp_denoise; + f.load(OPENTIMS_DIA_TEST_DATA, exp_denoise, cfg_denoise); + + BrukerTimsFile::Config cfg_no_denoise; + cfg_no_denoise.dia_ms2_n_neighbors = 1; + cfg_no_denoise.dia_ms2_min_support = 0; + cfg_no_denoise.dia_ms2_centroid = true; + MSExperiment exp_no_denoise; + f.load(OPENTIMS_DIA_TEST_DATA, exp_no_denoise, cfg_no_denoise); + + Size denoise_peaks = 0, no_denoise_peaks = 0; + Size denoise_spectra = 0, no_denoise_spectra = 0; + for (const auto& spec : exp_denoise) + if (spec.getMSLevel() == 2) { ++denoise_spectra; denoise_peaks += spec.size(); } + for (const auto& spec : exp_no_denoise) + if (spec.getMSLevel() == 2) { ++no_denoise_spectra; no_denoise_peaks += spec.size(); } + + // Without denoising, a few spectra that would become empty after denoising + // (and are dropped by the `if (peaks.empty()) continue;` guard in the + // BrukerTimsFile loader) are now retained. Count is >= denoised count. + TEST_EQUAL(no_denoise_spectra >= denoise_spectra, true); + + // Without denoising the centroided output contains strictly more peaks: + // every populated grid cell becomes a centroid candidate instead of being + // dropped when isolated. + TEST_NOT_EQUAL(no_denoise_peaks, 0); + TEST_EQUAL(no_denoise_peaks > denoise_peaks, true); + + // Output is still centroided in the IM dimension. + for (const auto& spec : exp_no_denoise) + { + if (spec.getMSLevel() == 2 && !spec.empty()) + { + TEST_EQUAL(spec.containsIMData(), true); + TEST_EQUAL(spec.getIMPeakType() == IMPeakType::IM_CENTROIDED, true); + TEST_NOT_EQUAL(spec.getPrecursors().size(), 0); + break; + } + } + + STATUS("DIA centroiding (no denoise): MS2 peaks=" << no_denoise_peaks + << " vs denoised=" << denoise_peaks + << " ratio=" << (static_cast(no_denoise_peaks) / denoise_peaks)); +} +END_SECTION + START_SECTION(DIA readDIAMetadata test) { BrukerTimsFile f; diff --git a/src/topp/FileConverter.cpp b/src/topp/FileConverter.cpp index b9d14ac77f3..acb6fd8fec7 100644 --- a/src/topp/FileConverter.cpp +++ b/src/topp/FileConverter.cpp @@ -203,8 +203,9 @@ class TOPPFileConverter : setMinInt_("bruker:dia_ms2_n_neighbors", 0); registerIntOption_("bruker:dia_ms2_min_support", "", 1, "DIA MS2 denoising: minimum occupied neighbor cells in a 3x3 (m/z x IM) grid to keep a point " - "(center cell excluded from count). Applied after frame aggregation. Only effective when dia_ms2_n_neighbors > 0.", false, true); - setMinInt_("bruker:dia_ms2_min_support", 1); + "(center cell excluded from count). Applied after frame aggregation. Only effective when dia_ms2_n_neighbors > 0. " + "Set to 0 to disable denoising (useful for pure centroiding without noise filtering).", false, true); + setMinInt_("bruker:dia_ms2_min_support", 0); registerStringOption_("bruker:dia_ms2_centroid", "", "false", "Apply 2D Gaussian smoothing + local maxima peak picking to the denoised DIA MS2 grid. " "Produces IM_CENTROIDED spectra with sub-bin (m/z, IM) precision. Only effective when dia_ms2_n_neighbors > 0.", false, true); From 1099aa1da15eafde2e5c593916740f563ebede5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Startek?= Date: Wed, 15 Apr 2026 12:29:29 +0200 Subject: [PATCH 02/20] fix(opentims): register FetchContent target in build-tree export set install_library() adds opentims_cpp to the install-tree export (OpenMSTargets via install(EXPORT)), but the build-tree export() call in export_macros.cmake uses _OPENMS_EXPORT_TARGETS, which is only populated by openms_register_export_target(). Without this call CMake errors at generate time: export called with target "OpenMS" which requires target "opentims_cpp" that is not in any export set. Matches the pattern used by Evergreen and IsoSpec in their own CMakeLists.txt files. Co-Authored-By: Claude Sonnet 4.6 --- cmake/cmake_findExternalLibs.cmake | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cmake/cmake_findExternalLibs.cmake b/cmake/cmake_findExternalLibs.cmake index 4c78428c41e..592fa08a9f5 100644 --- a/cmake/cmake_findExternalLibs.cmake +++ b/cmake/cmake_findExternalLibs.cmake @@ -427,11 +427,12 @@ if (WITH_OPENTIMS) # opentims::opentims_cpp regardless of how the library was obtained. add_library(opentims::opentims_cpp ALIAS opentims_cpp) - # Add opentims_cpp to the OpenMSTargets export set (same as Evergreen, - # SQLiteCpp, and other bundled deps). CMake requires all targets linked - # by an exported target to be either imported or in an export set — even - # for PRIVATE deps of a shared library. + # Add opentims_cpp to both the install-tree and build-tree export sets + # (same pattern as Evergreen, IsoSpec, and other bundled deps). + # install_library() → OpenMSTargets install export + # openms_register_export_target() → _OPENMS_EXPORT_TARGETS build-tree export() install_library(opentims_cpp) + openms_register_export_target(opentims_cpp) message(STATUS "opentims: built from source (${opentims_SOURCE_DIR})") endif() From cdb808940a67d3d6074b04dfe19e519610aa7755 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Startek?= Date: Wed, 15 Apr 2026 13:11:31 +0200 Subject: [PATCH 03/20] fix(opentims): force static build and remove sqlite3 CMake dep for FetchContent path Two bugs in the FetchContent branch that only manifest on CI (where opentims is not installed system-wide): 1. BUILD_SHARED_LIBS=true is set in the top-level CMakeLists.txt (line 287) before cmake_findExternalLibs.cmake is included. Without an explicit override, opentims's CMakeLists.txt would produce libopentims_cpp.so instead of libopentims_cpp.a. The shared library lands in _deps/opentims-build/ rather than the system library path, so the build-time linker fails with "undefined reference to TimsDataHandle" when linking test binaries against libOpenMS.so (libopentims_cpp.so's directory is not in the test's -L search path). Fix: save/restore BUILD_SHARED_LIBS around FetchContent_MakeAvailable(opentims). 2. target_link_libraries(opentims_cpp PRIVATE sqlite3) links the bundled SQLite3 CMake target to the opentims_cpp STATIC library. For static library targets, CMake propagates PRIVATE deps into the link interface, so the export mechanism requires sqlite3 to be in the same export set. This conflicts with SQLiteCpp's own sqlite3 export and causes a CMake generate error. Fix: provide only the sqlite3 include directory (needed for compilation with OPENTIMS_LINK_SQLITE_STATICALLY); the sqlite3 symbols are resolved at final link time through OpenMS's SQLiteCpp dependency (SQLiteCpp PUBLIC-links sqlite3, so libsqlite3.a appears in libOpenMS.so's link command after libopentims_cpp.a). Co-Authored-By: Claude Sonnet 4.6 --- cmake/cmake_findExternalLibs.cmake | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/cmake/cmake_findExternalLibs.cmake b/cmake/cmake_findExternalLibs.cmake index 592fa08a9f5..0d1b646a7d1 100644 --- a/cmake/cmake_findExternalLibs.cmake +++ b/cmake/cmake_findExternalLibs.cmake @@ -398,12 +398,31 @@ if (WITH_OPENTIMS) set(OPENTIMS_BUILD_LIB ON CACHE BOOL "" FORCE) set(OPENTIMS_BUILD_PYTHON OFF CACHE BOOL "" FORCE) set(OPENTIMS_LINK_SQLITE_STATICALLY ON CACHE BOOL "" FORCE) - FetchContent_MakeAvailable(opentims) - # Provide OpenMS's sqlite3 headers and library to opentims + # OpenMS sets BUILD_SHARED_LIBS=true (to build libOpenMS.so). Without this + # override, opentims's CMakeLists.txt would build libopentims_cpp.so instead + # of libopentims_cpp.a. A shared opentims would land in _deps/opentims-build/ + # rather than the system lib path, so the build-time linker would fail with + # "undefined reference to TimsDataHandle" when linking test binaries against + # libOpenMS.so (because libopentims_cpp.so's directory is not in the test's + # -L search path). Force static so all opentims symbols get absorbed into + # libOpenMS.so at link time. + set(_openms_saved_build_shared_libs ${BUILD_SHARED_LIBS}) + set(BUILD_SHARED_LIBS OFF) + FetchContent_MakeAvailable(opentims) + set(BUILD_SHARED_LIBS ${_openms_saved_build_shared_libs}) + + # Provide OpenMS's sqlite3 headers to opentims so it compiles with + # OPENTIMS_LINK_SQLITE_STATICALLY (direct sqlite3 calls vs. dlopen). + # We intentionally do NOT call target_link_libraries(opentims_cpp PRIVATE sqlite3) + # here: adding the CMake sqlite3 target as a PRIVATE dep of a STATIC library + # would require sqlite3 to appear in the CMake export sets, which creates + # conflicts with SQLiteCpp's own sqlite3 export. The sqlite3 symbols are + # resolved at final link time through OpenMS's own SQLiteCpp dependency + # (SQLiteCpp PUBLIC-links sqlite3, so libsqlite3.a already appears in + # libOpenMS.so's link command after libopentims_cpp.a). target_include_directories(opentims_cpp PRIVATE "${CMAKE_SOURCE_DIR}/src/openms/extern/SQLiteCpp/sqlite3") - target_link_libraries(opentims_cpp PRIVATE sqlite3) # ZSTD: prefer system; fall back to opentims's bundled decoder. set(_OPENTIMS_SRC "${opentims_SOURCE_DIR}/src/opentims++") From 2155f709e6ca8e10e85a21b93fec81b36ddb703e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Startek?= Date: Wed, 15 Apr 2026 13:20:07 +0200 Subject: [PATCH 04/20] fix(opentims): treat empty INTERFACE_COMPILE_DEFINITIONS as manual-fallback sentinel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FindOpentims.cmake sets INTERFACE_COMPILE_DEFINITIONS to "" on the manually created UNKNOWN IMPORTED target to signal "found via header/library search, sqlite linkage unknown". The previous condition only checked MATCHES "NOTFOUND" (which CMake sets when get_target_property finds no property), so it never fired for the empty-string case — skipping the sqlite3 include/link injection for system installs that lack a CMake config package. Add OR _opentims_defs STREQUAL "" to catch this case. Co-Authored-By: Claude Sonnet 4.6 --- cmake/cmake_findExternalLibs.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/cmake_findExternalLibs.cmake b/cmake/cmake_findExternalLibs.cmake index 0d1b646a7d1..610170a88c2 100644 --- a/cmake/cmake_findExternalLibs.cmake +++ b/cmake/cmake_findExternalLibs.cmake @@ -376,7 +376,7 @@ if (WITH_OPENTIMS) # conservatively inject sqlite3 because we cannot know how it was built. get_target_property(_opentims_defs opentims::opentims_cpp INTERFACE_COMPILE_DEFINITIONS) if(_opentims_defs MATCHES "OPENTIMS_LINK_SQLITE_STATICALLY" - OR (_opentims_defs MATCHES "NOTFOUND" AND NOT opentims_FOUND)) + OR ((_opentims_defs MATCHES "NOTFOUND" OR _opentims_defs STREQUAL "") AND NOT opentims_FOUND)) target_include_directories(opentims::opentims_cpp INTERFACE "${CMAKE_SOURCE_DIR}/src/openms/extern/SQLiteCpp/sqlite3") target_link_libraries(opentims::opentims_cpp INTERFACE sqlite3) From b292dcb7e4bf63920d2524d97254d8ad44b66e75 Mon Sep 17 00:00:00 2001 From: Timo Sachsenberg Date: Wed, 15 Apr 2026 12:45:33 +0200 Subject: [PATCH 05/20] refactor: extract loadMS1Spectrum helper in BrukerTimsFile (#9152) The centroid-or-copy branch for MS1 frame loading was duplicated across four call sites (loadDIA_, loadDDA_, loadDIAStreaming, loadFrameMode_). Extract a static loadMS1Spectrum(frame, spec, config, centroider) that runs IM-centroiding when configured and falls back to frameToSpectrum() otherwise. Each call site drops the local do_centroid bool and the if/else branch. Pure refactor: no behavior change. Existing BrukerTimsFile_test suite (13 sections including all DIA/DDA integration paths) passes unchanged. Co-authored-by: Claude Opus 4.6 (1M context) --- src/openms/source/FORMAT/BrukerTimsFile.cpp | 48 ++++++++------------- 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/src/openms/source/FORMAT/BrukerTimsFile.cpp b/src/openms/source/FORMAT/BrukerTimsFile.cpp index ada78a34813..6b030a783b3 100644 --- a/src/openms/source/FORMAT/BrukerTimsFile.cpp +++ b/src/openms/source/FORMAT/BrukerTimsFile.cpp @@ -996,6 +996,19 @@ namespace OpenMS spec.setIMPeakType(IMPeakType::IM_CENTROIDED); } + // ===================================================================== + // Helper: build one MS1 spectrum from a frame, with optional IM centroiding + // ===================================================================== + static void loadMS1Spectrum(TimsFrame& frame, MSSpectrum& spec, + const BrukerTimsFile::Config& config, + FrameCentroider& centroider) + { + if (isCentroidingEnabled(config)) + centroidMS1Frame(frame, spec, config, centroider); + else + frameToSpectrum(frame, spec, 1); + } + // ===================================================================== // isDIA_: detect DDA vs DIA by querying for SWATH window tables // ===================================================================== @@ -1111,7 +1124,6 @@ namespace OpenMS SQLite::Database db(std::string(tdf_path), SQLite::OPEN_READONLY); // --- MS1 frames --- - bool do_centroid = isCentroidingEnabled(config); FrameCentroider centroider; std::vector ms1_frame_ids; @@ -1126,10 +1138,7 @@ namespace OpenMS { TimsFrame& frame = handle->get_frame(ms1_frame_ids[i]); MSSpectrum spec; - if (do_centroid) - centroidMS1Frame(frame, spec, config, centroider); - else - frameToSpectrum(frame, spec, 1); + loadMS1Spectrum(frame, spec, config, centroider); // Sort peaks by m/z (and the associated IM float data array alongside). // TIMS save_to_buffs returns peaks in (scan_id, m/z-within-scan) order, // which is NOT globally m/z-sorted. Downstream consumers @@ -1376,20 +1385,12 @@ namespace OpenMS startProgress(0, ms1_frame_ids.size() + num_ms2, "Loading DDA-PASEF data"); // --- MS1 frames --- - bool do_centroid = isCentroidingEnabled(config); FrameCentroider centroider; for (size_t i = 0; i < ms1_frame_ids.size(); ++i) { TimsFrame& frame = handle.get_frame(ms1_frame_ids[i]); MSSpectrum spec; - if (do_centroid) - { - centroidMS1Frame(frame, spec, config, centroider); - } - else - { - frameToSpectrum(frame, spec, 1); - } + loadMS1Spectrum(frame, spec, config, centroider); exp.addSpectrum(std::move(spec)); setProgress(i); } @@ -1758,21 +1759,13 @@ namespace OpenMS } // --- MS1 frames --- - bool do_centroid = isCentroidingEnabled(config); FrameCentroider centroider; startProgress(0, ms1_frame_ids.size(), "Loading DIA-PASEF MS1 frames"); for (size_t i = 0; i < ms1_frame_ids.size(); ++i) { TimsFrame& frame = handle.get_frame(ms1_frame_ids[i]); MSSpectrum spec; - if (do_centroid) - { - centroidMS1Frame(frame, spec, config, centroider); - } - else - { - frameToSpectrum(frame, spec, 1); - } + loadMS1Spectrum(frame, spec, config, centroider); exp.addSpectrum(std::move(spec)); setProgress(i); } @@ -2011,21 +2004,16 @@ namespace OpenMS continue; } - bool do_centroid = (level == 1) && isCentroidingEnabled(config); FrameCentroider centroider; startProgress(0, frame_ids.size(), String("Loading MS") + String(level) + " frames"); for (size_t i = 0; i < frame_ids.size(); ++i) { TimsFrame& frame = handle.get_frame(frame_ids[i]); MSSpectrum spec; - if (do_centroid) - { - centroidMS1Frame(frame, spec, config, centroider); - } + if (level == 1) + loadMS1Spectrum(frame, spec, config, centroider); else - { frameToSpectrum(frame, spec, level); - } exp.addSpectrum(std::move(spec)); setProgress(i); } From 894caef3315731f9f80ac078edf5b60f197603b7 Mon Sep 17 00:00:00 2001 From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com> Date: Wed, 15 Apr 2026 17:11:26 +0200 Subject: [PATCH 06/20] fix: mark Bruker TDF spectra as CENTROID so downstream tools accept them (#9154) merge because this path is not tested on ci --- src/openms/source/FORMAT/BrukerTimsFile.cpp | 7 +++++++ .../class_tests/openms/source/BrukerTimsFile_test.cpp | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/src/openms/source/FORMAT/BrukerTimsFile.cpp b/src/openms/source/FORMAT/BrukerTimsFile.cpp index 6b030a783b3..b535ae2c4e6 100644 --- a/src/openms/source/FORMAT/BrukerTimsFile.cpp +++ b/src/openms/source/FORMAT/BrukerTimsFile.cpp @@ -911,6 +911,9 @@ namespace OpenMS spec.setRT(frame.time); spec.setMSLevel(ms_level); spec.setDriftTimeUnit(DriftTimeUnit::VSSC); + // TDF peaks are detector-centroided in m/z (peak-list format); label as + // centroid so downstream tools (e.g. CometAdapter) don't reject as profile. + spec.setType(SpectrumSettings::SpectrumType::CENTROID); spec.setNativeID("frame=" + String(frame.id)); if (frame.num_peaks == 0) return; @@ -1203,6 +1206,7 @@ namespace OpenMS spec.setRT(frame.time); spec.setMSLevel(2); spec.setDriftTimeUnit(DriftTimeUnit::VSSC); + spec.setType(SpectrumSettings::SpectrumType::CENTROID); spec.setNativeID("frame=" + String(frame.id) + " windowGroup=" + String(win->window_group) + " scan=" + String(win->scan_begin)); Precursor prec; @@ -1699,6 +1703,7 @@ namespace OpenMS spec.setMSLevel(2); spec.setDriftTime(scalar_im); spec.setDriftTimeUnit(DriftTimeUnit::VSSC); + spec.setType(SpectrumSettings::SpectrumType::CENTROID); spec.setNativeID("scan=" + String(prec_id)); // Copy sorted peak data @@ -1873,6 +1878,7 @@ namespace OpenMS spec.setRT(center_frame.time); spec.setMSLevel(2); spec.setDriftTimeUnit(DriftTimeUnit::VSSC); + spec.setType(SpectrumSettings::SpectrumType::CENTROID); spec.setNativeID("frame=" + String(center_frame.id) + " windowGroup=" + String(win->window_group) + " scan=" + String(win->scan_begin)); Precursor prec; @@ -1937,6 +1943,7 @@ namespace OpenMS spec.setRT(frame.time); spec.setMSLevel(2); spec.setDriftTimeUnit(DriftTimeUnit::VSSC); + spec.setType(SpectrumSettings::SpectrumType::CENTROID); spec.setNativeID("frame=" + String(frame.id) + " windowGroup=" + String(win->window_group) + " scan=" + String(win->scan_begin)); Precursor prec; diff --git a/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp b/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp index 285ea7fdf85..cf40c3d5228 100644 --- a/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp +++ b/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp @@ -248,6 +248,9 @@ START_SECTION(DDA loading integration test) TEST_NOT_EQUAL(spec.getDriftTime(), 0.0); TEST_EQUAL(spec.getDriftTimeUnit(), DriftTimeUnit::VSSC); TEST_NOT_EQUAL(spec.getPrecursors()[0].getMZ(), 0.0); + // TDF MS2 peaks are detector-centroided; must be labelled CENTROID so + // downstream tools (e.g. CometAdapter) don't reject them as profile. + TEST_EQUAL(spec.getType(), SpectrumSettings::SpectrumType::CENTROID); break; } } @@ -313,6 +316,8 @@ START_SECTION(DDA frame-level loading test) if (!spec.empty()) { TEST_EQUAL(spec.containsIMData(), true); + // frameToSpectrum emits detector-centroided peaks; label as CENTROID. + TEST_EQUAL(spec.getType(), SpectrumSettings::SpectrumType::CENTROID); break; } } @@ -505,6 +510,8 @@ START_SECTION(DIA loading integration test) { TEST_EQUAL(spec.containsIMData(), true); TEST_NOT_EQUAL(spec.getPrecursors().size(), 0); + // TDF MS2 peaks are detector-centroided; must be labelled CENTROID. + TEST_EQUAL(spec.getType(), SpectrumSettings::SpectrumType::CENTROID); break; } } From 0a626964ff2332cee5f1641a9006d308643d698d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 16 Apr 2026 07:50:29 +0200 Subject: [PATCH 07/20] docs: sync CHANGELOG with recent changes (2026-04-16) (#9157) Add three missing entries for changes merged after the 2026-04-15 sync: - #9154: BrukerTimsFile spectra marked as CENTROID - #9148/#9147: system opentims install fallback before FetchContent - #9151: Bruker DIA MS2 centroiding without denoising (min_support=0) Co-authored-by: GitHub Copilot Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CHANGELOG | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 22cb7ea9506..acf80746e15 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -33,6 +33,7 @@ General: - Fix: TheoreticalSpectrumGenerator: corrected m/z calculation for b/a neutral-loss fragment ions; the neutral-loss loop previously omitted the first residue mass, causing all b/a neutral-loss fragments to report masses that were too low by the first residue's internal mass (#9078, #9084) - Fix: LinearResampler: right_index was clamped to the number of raw data points instead of the number of resampled points; when resampled points far outnumber raw points (e.g. fine spacing over a wide m/z range), this caused extreme intensity spikes; bug introduced in 2008 and masked by LinearResamplerAlign which has an independent correct implementation (#9127) - Transparent .zip decompression for XML-based formats (mzML, idXML, etc.) and Bruker .d directories: files with a .zip extension are now decompressed on-the-fly during parsing; .d.zip archives are extracted to a temporary directory and loaded via BrukerTimsFile; no conversion step required (#9139) + - Fix: BrukerTimsFile: DDA-PASEF and DIA-PASEF spectra are now marked as CENTROID (PeakType::PEAK) so downstream tools that check spectrum type (e.g. FeatureFinderMultiplex, OpenSwathWorkflow) accept them without extra conversion (#9154) Dependencies: - PyOpenMS: Autowrap/Cython replaced with nanobind; nanobind 2.10.0+ fetched automatically (#8699) @@ -50,6 +51,7 @@ Dependencies: - boost-process added as a header-only dependency for cross-platform subprocess management (#8939) - ENABLE_TDL CMake option (required for CWL/Common Workflow Language file support) now defaults to OFF; set -DENABLE_TDL=ON to re-enable (#9067) - Boost minimum version requirement raised from 1.78 to 1.81 (#9095) + - opentims: CMake now searches for a system-installed opentims_cpp via FindOpentims.cmake before falling back to a FetchContent git clone; packagers and users with opentims already installed can set opentims_DIR or opentims_ROOT to skip the download (#9147, #9148) TOPP tools: Changes: @@ -111,6 +113,7 @@ TOPP tools: - Bruker DIA MS2 frame aggregation, denoising, and OpenSwath streaming: new BrukerTimsFile::Config fields (bruker:dia_ms2_n_neighbors, bruker:dia_ms2_min_support, bruker:dia_ms2_centroid) enable grid-based denoising and 2D peak picking of DIA MS2 frames; aggregated spectra can be streamed directly into the OpenSwath pipeline without prior mzML conversion; registered in FileConverter and PeakPickerIM; requires WITH_TIMSRUST=ON (#9129) - Fix: BrukerTimsFile::loadDIAStreaming now sorts each spectrum's peaks by m/z before feeding them to the OpenSwath consumer; unsorted peaks (caused by TIMS scan ordering) silently produced empty chromatograms, breaking iRT calibration on .d DIA input (#9135) - Fix: segfault when use_consensus=false; hull_points returned by PeakIntegrator::integratePeak varied in size per-transition, causing Eigen::Map to read past the end of the shorter vector in MRMScoring::initializeXCorrMatrix; fixed by padding hull_points to the full resampled chromatogram length in non-consensus mode (#9143) + - Allow Bruker DIA MS2 centroiding without denoising: bruker:dia_ms2_min_support can now be set to 0; with centroid=true and min_support=0 every populated grid cell is a centroid candidate, skipping the neighbor-support filter; useful for low-density data where the 3×3 denoising step discards real peaks (#9151) PyOpenMS: - BREAKING: Binding backend replaced from Autowrap/Cython to hand-maintained nanobind C++ bindings (#8699) From 96b55c4de8c8d4dd41aeec16705b9a52fa37bf90 Mon Sep 17 00:00:00 2001 From: Timo Sachsenberg Date: Thu, 16 Apr 2026 12:33:03 +0200 Subject: [PATCH 08/20] feat: Bruker timsTOF MS1 frame aggregation (DIA + DDA) (#9158) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: rename DIAFrameAggregator to FrameAggregator The aggregator is scope-agnostic (no SWATH assumptions inside; the MS2 scan-window filter lives in the outer loop, not the class). Renaming prepares for reuse from the MS1 aggregation path. Pure refactor: no behavior change. Co-Authored-By: Claude Opus 4.6 (1M context) * refactor: parameterize FrameAggregator m/z bin width Lift MZ_BIN_WIDTH from static constexpr to a constructor parameter. Default 0.02 Da preserves existing MS2 behavior; MS1 callers will pass a smaller value (0.01 Da) tuned for timsTOF MS1 mass accuracy. Pure refactor: no behavior change. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: guard FrameAggregator neighbor lookups against uint32 wrap at scan_id=0 finalize() and finalizeCentroided() compute scan_id + ds and mz_bin + dm with negative offsets. When either axis is 0 and the offset is negative, the unsigned arithmetic wraps to 0xFFFFFFFF and hits phantom cells. Latent in MS2 (edge scans rare), but exposed at MS1 density where peaks at scan_id == 0 are common. Fixed at all five negative-offset lookup sites via a shared neighborKey() helper that returns std::nullopt on wrap. Co-Authored-By: Claude Opus 4.6 (1M context) * refactor: use contributing_frames count for skip_denoise in aggregator loops Previously skip_denoise was based on the index span (hi - lo), which could run denoising when neighbor frames existed in the range but contributed no peaks after the SWATH-window filter. Now the outer loop counts frames that actually added peaks to the grid, and uses that as the denoise predicate. Closes the pre-existing TODO from #9151 and prepares the same logic for reuse in the upcoming MS1 aggregation path. Co-Authored-By: Claude Opus 4.6 (1M context) * refactor: add FrameCentroider::loadFrame(const double*) overload The upcoming MS1 aggregation path sums per-peak intensities across adjacent frames into FrameAggregator::Cell::intensity_sum (double). For MS1 with N=2 and a 10^9-intensity base peak, the sum reaches 5*10^9 and overflows uint32_t. Internal storage is already float (ImsPeak::intensity), so the new overload just casts double->float at the entry point. The existing uint32_t overload is untouched; MS2 callers see no change. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: add ms1_n_neighbors, ms1_min_support, ms1_max_rt_distance_sec Plumb three new knobs for Bruker MS1 frame aggregation through the Config struct and the FileConverter CLI. The library side is wired up in follow-up commits; this commit is purely surface-level and has no runtime effect (defaults 0/0.0 preserve existing behavior). Co-Authored-By: Claude Opus 4.6 (1M context) * test: add DIA MS1 default config regression section Locks in the "zero behavior change at defaults" guarantee for the upcoming MS1 frame aggregation feature. Snapshots MS1 spectrum count, peak count, and cumulative intensity with Config{} defaults, and independently reproduces with the new knobs explicitly set to 0 / 0.0 — asserts both paths produce identical numbers. Co-Authored-By: Claude Opus 4.6 (1M context) * test: add RED test for DIA MS1 frame aggregation Asserts that ms1_n_neighbors=1 produces an intensity-boosted MS1 output relative to the raw per-frame path. Fails until loadAggregatedMS1Spectrum is implemented (next task). Co-Authored-By: Claude Opus 4.6 (1M context) * feat: implement MS1 frame aggregation in Bruker DIA loader Add loadAggregatedMS1Spectrum() helper that builds one aggregated MS1 spectrum per center frame from a sliding window of 2N+1 adjacent MS1 frames, using FrameAggregator with a 0.01 Da bin (finer than the MS2 default). Composes with FrameCentroider for within-frame IM centroiding when ms1_centroid_mz_ppm/pct are set. Wires the new helper into loadDIA_ via an explicit branch on config.ms1_n_neighbors > 0, preserving the per-frame code path when aggregation is disabled. Also adds FrameAggregator::reserve() to pre-size the grid bucket array (MS1 grids reach ~300k cells per center at 0.01 Da bins on dense diaPASEF MS1). DIA MS1 aggregation test bounds re-tuned to [2.5, 3.1] — MS1 ion populations are more stable across adjacent frames than MS2 fragments, so 3-frame aggregation gives close to the naive 3x sum (empirical 2.99) rather than MS2's ~1.28x. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: MS1 frame aggregation for Bruker DDA-PASEF loader Extend the ms1_n_neighbors dispatch to loadDDA_ and add a DDA-specific test section validating both the aggregation intensity boost and the ms1_max_rt_distance_sec cap's ability to truncate the neighbor window on irregular DDA frame cadences. The STATUS line reports cap_reduction so future test-runners can re-tune the cap if the fixture's MS1 cadence changes. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: MS1 frame aggregation in Bruker DIA streaming loader Extend the ms1_n_neighbors dispatch to loadDIAStreaming so OpenSwath consumers see an aggregated MS1 survey when requested. Preserves the per-center-frame cadence invariant that SwathFileConsumer relies on. Adds a streaming-specific test section validating spectrum count equality with the raw MS1 frame count. Co-Authored-By: Claude Opus 4.6 (1M context) * test: MS1 aggregation composition, min_support, RT cap, FRAME mode Four new test sections exercising every leaf of the MS1 aggregation feature: - DIA MS1 aggregation with centroiding test — confirms composition with FrameCentroider. Centroider caps output at MAX_CENTROID_PEAKS=10000 per frame, so on dense MS1 surveys (~600k aggregated peaks/frame) the top 10k carry ≈0.32 of total intensity. Bounds [0.1, 1.0] accept this. - DIA MS1 aggregation min_support test — confirms denoise works: drop_frac ≈ 0.52, intensity still 1.8× raw (signal peaks survive). - DIA MS1 RT cap test — 0.5s cap at N=2 reduces intensity by 80% (DIA MS1 cadence ~1.5s; tight cap collapses the window to just the center frame, so intensity goes from 5x raw down to 1x raw). - FRAME mode ignores ms1_n_neighbors test — FRAME export emits one spectrum per raw MS1 frame regardless of the aggregation knob. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: warn-and-ignore + partial-config warnings for MS1 aggregation - loadFrames_: warn when ms1_n_neighbors > 0 in FRAME export mode, proceed per-frame (preserves FRAME's "raw frames" contract). - warnPartialMS1AggregationConfig(): one-shot warnings for three cases: - ms1_min_support > 0 with ms1_n_neighbors == 0 (knob inert) - ms1_max_rt_distance_sec > 0 with ms1_n_neighbors == 0 (knob inert) - 2*N+1 > ms1_frame_count (window clamped at run edges) Called from loadDIA_, loadDDA_, and loadDIAStreaming after ms1_frame_ids is populated. Mirrors the existing isCentroidingEnabled partial-config warn pattern. Co-Authored-By: Claude Opus 4.6 (1M context) * docs: CHANGELOG entry for Bruker MS1 frame aggregation Co-Authored-By: Claude Opus 4.6 (1M context) * feat: ms1_centroid_max_peaks knob + per-load cap-hit summary The FrameCentroider had a hardcoded MAX_CENTROID_PEAKS=10000 cap that silently dropped up to 68% of intensity on aggregated MS1 surveys (flagged by Codex review of PR #9158). Exposing the cap as a knob and raising the default to 100000 makes the loss recoverable. Changes: - New Config::ms1_centroid_max_peaks field (default 100000). - New CLI option -bruker:ms1_centroid_max_peaks (default 100000, min 1). - Plumbed through FileConverter::getBrukerConfig_. - FrameCentroider::centroid() signature accepts max_peaks parameter; default constant renamed to DEFAULT_CENTROID_MAX_PEAKS (=10000). - FrameCentroider tracks cap-hit count + max dropped-peak intensity per load; reportCapSummary() emits ONE summary WARN per load() (not one per spectrum). Warning only fires when dropped peaks are above noise floor (>200 intensity counts). - reportCapSummary() called at the end of loadDIA_, loadDDA_, loadDIAStreaming, and loadFrames_ level-1 loops. On the DIA test fixture: at the 100k default the cap is hit on ~97% of spectra but only noise-floor peaks are dropped, so the summary warning stays silent. Intensity retention jumps from 32% (at 10k) to 70% (at 100k). Co-Authored-By: Claude Opus 4.6 (1M context) * docs,test: address CodeRabbit review comments on PR #9158 All four comments were Minor but legitimate: 1. CHANGELOG used an ambiguous "ms1_centroid_mz_ppm/pct" shorthand. Spell out both identifiers explicitly, plus the new ms1_centroid_max_peaks knob. 2. FRAME mode test locked only spectrum count. A silent content corruption (e.g. aggregation partially running but writing to per-frame output) would pass. Now also asserts peak count and cumulative intensity equality between ref and agg runs. 3. DIA MS1 streaming aggregation test only asserted the cadence invariant (meta.nr_ms1_spectra), which is independent of whether ms1_n_neighbors is honored at all. Now also runs a non-aggregated streaming baseline and asserts total MS1 peaks(agg) > peaks(ref) — proof that aggregation actually ran. Also replaces swath_maps.front() with a proper m.ms1-flag lookup. 4. -bruker:ms1_min_support accepted any non-negative integer, but in a 3x3 grid with center excluded the max possible neighbors is 8. Added setMaxInt_(..., 8); values >8 are now rejected at the CLI boundary instead of silently dropping every peak. Co-Authored-By: Claude Opus 4.6 (1M context) --------- Co-authored-by: Claude Opus 4.6 (1M context) --- CHANGELOG | 1 + .../include/OpenMS/FORMAT/BrukerTimsFile.h | 14 + src/openms/source/FORMAT/BrukerTimsFile.cpp | 406 ++++++++++++-- .../openms/source/BrukerTimsFile_test.cpp | 500 +++++++++++++++++- src/topp/FileConverter.cpp | 33 ++ 5 files changed, 898 insertions(+), 56 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index acf80746e15..bade880547c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -34,6 +34,7 @@ General: - Fix: LinearResampler: right_index was clamped to the number of raw data points instead of the number of resampled points; when resampled points far outnumber raw points (e.g. fine spacing over a wide m/z range), this caused extreme intensity spikes; bug introduced in 2008 and masked by LinearResamplerAlign which has an independent correct implementation (#9127) - Transparent .zip decompression for XML-based formats (mzML, idXML, etc.) and Bruker .d directories: files with a .zip extension are now decompressed on-the-fly during parsing; .d.zip archives are extracted to a temporary directory and loaded via BrukerTimsFile; no conversion step required (#9139) - Fix: BrukerTimsFile: DDA-PASEF and DIA-PASEF spectra are now marked as CENTROID (PeakType::PEAK) so downstream tools that check spectrum type (e.g. FeatureFinderMultiplex, OpenSwathWorkflow) accept them without extra conversion (#9154) + - Bruker TimsTOF MS1 frame aggregation for DIA-PASEF and DDA-PASEF: new BrukerTimsFile::Config fields (ms1_n_neighbors, ms1_min_support, ms1_max_rt_distance_sec, ms1_centroid_max_peaks) sum peaks from 2N+1 adjacent MS1 frames into a sparse (mz_bin, scan_id) grid per center frame, composable with the existing within-frame IM centroiding (ms1_centroid_mz_ppm and ms1_centroid_im_pct). Exposed through FileConverter as -bruker:ms1_n_neighbors, -bruker:ms1_min_support, -bruker:ms1_max_rt_distance_sec, -bruker:ms1_centroid_max_peaks. Disabled by default. Also renames the internal DIAFrameAggregator class to FrameAggregator (scope-agnostic now that MS1 uses it), guards its (mz_bin, scan_id) grid lookups against uint32 wrap at scan_id=0, and switches the skip_denoise predicate in the MS2 aggregation loop from index-span to actual contributing-frame count (closes #9151 TODO). Dependencies: - PyOpenMS: Autowrap/Cython replaced with nanobind; nanobind 2.10.0+ fetched automatically (#8699) diff --git a/src/openms/include/OpenMS/FORMAT/BrukerTimsFile.h b/src/openms/include/OpenMS/FORMAT/BrukerTimsFile.h index b536b055236..39d5897a866 100644 --- a/src/openms/include/OpenMS/FORMAT/BrukerTimsFile.h +++ b/src/openms/include/OpenMS/FORMAT/BrukerTimsFile.h @@ -54,11 +54,25 @@ namespace OpenMS float ms1_centroid_mz_ppm = 0.0f; ///< MS1 IM-centroiding m/z tolerance in ppm (0 = disabled, suggested: 5.0). Adapted from Sage (Lazear 2023, doi:10.1021/acs.jproteome.3c00486). float ms1_centroid_im_pct = 0.0f; ///< MS1 IM-centroiding ion mobility tolerance in percent (0 = disabled, suggested: 3.0) + int ms1_centroid_max_peaks = 100000; ///< Upper bound on centroided peaks per MS1 spectrum. Caps retention at the top-intensity peaks; low-intensity tail is dropped when the limit is hit (a warning is emitted at that point). Sized for aggregated MS1 where frame-stacked grids commonly exceed 200k peaks; per-frame MS1 rarely hits even 10k. int dia_ms2_n_neighbors = 0; ///< DIA MS2 frame aggregation: number of adjacent frames on each side (0 = disabled, 1 = 3-frame sum, 2 = 5-frame sum) int dia_ms2_min_support = 1; ///< DIA MS2 denoising: minimum occupied neighbors in 3x3 (mz x IM) grid to retain a point (center excluded) bool dia_ms2_centroid = false; ///< DIA MS2 2D peak picking: apply Gaussian smoothing + local maxima detection to produce IM_CENTROIDED spectra + int ms1_n_neighbors = 0; ///< MS1 frame aggregation: adjacent MS1 frames on each side + ///< (0 = disabled, 1 = 3-frame sum, 2 = 5-frame sum). Applies + ///< to both DIA and DDA; ignored in FRAME export mode. + int ms1_min_support = 0; ///< MS1 denoising after aggregation: min occupied 3x3 neighbors + ///< (0 = disabled). Only effective when ms1_n_neighbors > 0. + ///< Differs from dia_ms2_min_support (default 1) to preserve + ///< zero-change defaults for the MS1 path. + double ms1_max_rt_distance_sec = 0.0; ///< Advanced: cap the RT distance (seconds) between a neighbor + ///< MS1 frame and the center frame during aggregation + ///< (0.0 = no cap). The center frame is always included + ///< regardless of this cap. Recommended for DDA where MS1 + ///< frame cadence is irregular. + enum ExportMode { AUTO, SPECTRUM, FRAME }; ExportMode export_mode = AUTO; ///< AUTO detects DDA vs DIA; SPECTRUM forces per-precursor; FRAME returns raw 4D frames diff --git a/src/openms/source/FORMAT/BrukerTimsFile.cpp b/src/openms/source/FORMAT/BrukerTimsFile.cpp index b535ae2c4e6..4dad814d6fe 100644 --- a/src/openms/source/FORMAT/BrukerTimsFile.cpp +++ b/src/openms/source/FORMAT/BrukerTimsFile.cpp @@ -31,6 +31,7 @@ #include #include #include +#include namespace OpenMS { @@ -148,7 +149,12 @@ namespace OpenMS float im; }; - static constexpr size_t MAX_CENTROID_PEAKS = 10000; + // Default cap on the number of centroided peaks per frame. Overridable via + // BrukerTimsFile::Config::ms1_centroid_max_peaks. For per-frame MS1 (~200k + // peaks) the 10k default covers the top-intensity peaks well; for aggregated + // MS1 where the frame-stacked grid can exceed 600k peaks, users should raise + // this (100k is the new CLI default). + static constexpr size_t DEFAULT_CENTROID_MAX_PEAKS = 10000; // Reusable buffer for IM-dimension centroiding of a single frame. // Translated from Sage's PeakBuffer (Lazear 2023, doi:10.1021/acs.jproteome.3c00486). @@ -159,6 +165,12 @@ namespace OpenMS std::vector order; // indices sorted by descending intensity std::vector agg_buff; // centroided output + // Cap-hit tracking across frames within one load() call. Emitted as a + // single summary line at the end of the load via reportCapSummary(). + size_t cap_hits_ = 0; + size_t total_calls_ = 0; + float max_dropped_intensity_ = 0.0f; + void clear() { peaks.clear(); @@ -196,17 +208,46 @@ namespace OpenMS }); } + // Overload: accepts double* intensities (from FrameAggregator::finalize() output, + // where summed intensities across aggregated frames can exceed uint32_t range). + void loadFrame(const double* mz_values, const double* intensities, + const double* im_values, uint32_t count) + { + clear(); + peaks.reserve(count); + for (uint32_t i = 0; i < count; ++i) + { + peaks.push_back({static_cast(mz_values[i]), + static_cast(intensities[i]), + static_cast(im_values[i])}); + } + + std::sort(peaks.begin(), peaks.end(), + [](const ImsPeak& a, const ImsPeak& b) { return a.mz < b.mz; }); + + order.resize(count); + std::iota(order.begin(), order.end(), size_t(0)); + std::sort(order.begin(), order.end(), + [this](size_t a, size_t b) + { + return peaks[b].intensity < peaks[a].intensity; + }); + } + // Centroid the loaded frame by collapsing the IM dimension. // Iterates peaks in descending intensity order. For each apex peak, aggregates // all unconsumed neighbors within m/z (ppm) and IM (percent) tolerances. - // Output is capped at MAX_CENTROID_PEAKS entries. + // Output is capped at max_peaks entries (dropped peaks logged at WARN level + // when high-intensity, at DEBUG otherwise). void centroid(float mz_ppm, float im_pct, std::vector& out_mz, std::vector& out_intensity, - std::vector& out_im) + std::vector& out_im, + size_t max_peaks = DEFAULT_CENTROID_MAX_PEAKS) { agg_buff.clear(); - agg_buff.reserve(std::min(peaks.size(), MAX_CENTROID_PEAKS + 1)); + agg_buff.reserve(std::min(peaks.size(), max_peaks + 1)); + ++total_calls_; const float utol = mz_ppm / 1e6f; const float im_tol_frac = im_pct / 100.0f; @@ -216,13 +257,16 @@ namespace OpenMS { if (peaks[idx].intensity <= 0.0f) continue; // already consumed - if (agg_buff.size() > MAX_CENTROID_PEAKS) + if (agg_buff.size() > max_peaks) { + // Only count as a "real" cap hit if the next dropped peak is above + // the noise floor (~200 counts on timsTOF). Dropping noise-floor + // peaks is the expected behavior of the cap and not worth reporting. if (peaks[idx].intensity > 200.0f) { - OPENMS_LOG_DEBUG << "FrameCentroider: reached MAX_CENTROID_PEAKS at index " - << idx << "/" << peaks.size() - << " intensity=" << peaks[idx].intensity << std::endl; + ++cap_hits_; + if (peaks[idx].intensity > max_dropped_intensity_) + max_dropped_intensity_ = peaks[idx].intensity; } break; } @@ -278,6 +322,25 @@ namespace OpenMS out_im[i] = agg_buff[i].im; } } + + // Emit one summary WARN if the cap fired on any frames during this load() + // call. Called by each top-level load path (loadDIA_, loadDDA_, + // loadDIAStreaming, loadFrames_) after all centroiding is done. + void reportCapSummary(size_t max_peaks) + { + if (cap_hits_ == 0) return; + OPENMS_LOG_WARN << "Warning: MS1 centroiding hit ms1_centroid_max_peaks (=" + << max_peaks << ") on " << cap_hits_ << " of " + << total_calls_ << " spectra; highest dropped-peak " + << "intensity was " << max_dropped_intensity_ + << ". Consider raising -bruker:ms1_centroid_max_peaks " + << "to retain more peaks." << std::endl; + // Reset so successive load() calls on the same instance don't carry + // state forward. + cap_hits_ = 0; + total_calls_ = 0; + max_dropped_intensity_ = 0.0f; + } }; // Helper: check if MS1 centroiding is enabled, warn on partial config @@ -294,6 +357,32 @@ namespace OpenMS return has_mz && has_im; } + // Emit one-shot partial-config warnings for the MS1 aggregation knobs. + // Called once per top-level load path, after ms1_frame_ids is populated. + static void warnPartialMS1AggregationConfig(const BrukerTimsFile::Config& config, + size_t ms1_frame_count) + { + if (config.ms1_min_support > 0 && config.ms1_n_neighbors == 0) + { + OPENMS_LOG_WARN << "Warning: ms1_min_support (=" << config.ms1_min_support + << ") ignored: ms1_n_neighbors=0 disables aggregation." << std::endl; + } + if (config.ms1_max_rt_distance_sec > 0.0 && config.ms1_n_neighbors == 0) + { + OPENMS_LOG_WARN << "Warning: ms1_max_rt_distance_sec (=" + << config.ms1_max_rt_distance_sec + << ") ignored: ms1_n_neighbors=0 disables aggregation." << std::endl; + } + if (config.ms1_n_neighbors > 0 && ms1_frame_count > 0 && + static_cast(2 * config.ms1_n_neighbors + 1) > ms1_frame_count) + { + OPENMS_LOG_WARN << "Warning: ms1_n_neighbors (=" << config.ms1_n_neighbors + << ") exceeds half the run's MS1 frame count (=" + << ms1_frame_count << "); effective window will be clamped " + << "at run edges." << std::endl; + } + } + // ===================================================================== // TdfMetadataReader: private helper functions for SQL-based metadata // ===================================================================== @@ -330,13 +419,14 @@ namespace OpenMS return false; } - /// Helper for DIA MS2 frame aggregation and denoising. - /// Bins peaks from multiple frames onto a sparse (mz_bin, scan_id) grid, - /// applies spatial denoising, and outputs the surviving peaks. - class DIAFrameAggregator + /// Grid-based aggregator for stacking peaks from adjacent frames into a sparse + /// (mz_bin, scan_id) grid, with optional 3x3 spatial denoising. Used by both + /// DIA MS2 aggregation and MS1 aggregation (with different bin widths). + class FrameAggregator { public: - static constexpr double MZ_BIN_WIDTH = 0.02; // Da — absorbs frame-to-frame m/z jitter + explicit FrameAggregator(double mz_bin_width = 0.02) + : mz_bin_width_(mz_bin_width) {} struct OutputPeak { @@ -345,10 +435,22 @@ namespace OpenMS uint32_t scan_id; // native scan index (for IM conversion) }; + /// Returns the grid key of the neighbor cell at signed (dm, ds) offset. + /// Returns std::nullopt when the offset would wrap past 0 on either axis. + static inline std::optional + neighborKey(uint32_t mz_bin, uint32_t scan_id, int dm, int ds) + { + if (dm < 0 && mz_bin < static_cast(-dm)) return std::nullopt; + if (ds < 0 && scan_id < static_cast(-ds)) return std::nullopt; + uint32_t new_mz = mz_bin + static_cast(dm); + uint32_t new_scan = scan_id + static_cast(ds); + return (static_cast(new_mz) << 32) | new_scan; + } + /// Add a peak to the grid. Call for every peak from every neighbor frame. void addPeak(double mz, uint32_t intensity, uint32_t scan_id) { - int64_t mz_bin = static_cast(std::round(mz / MZ_BIN_WIDTH)); + int64_t mz_bin = static_cast(std::round(mz / mz_bin_width_)); uint64_t key = (static_cast(static_cast(mz_bin)) << 32) | scan_id; auto& cell = grid_[key]; @@ -379,9 +481,10 @@ namespace OpenMS for (int ds = -1; ds <= 1; ++ds) { if (dm == 0 && ds == 0) continue; - uint64_t nkey = (static_cast(static_cast(mz_bin + dm)) << 32) - | (scan_id + ds); - if (grid_.count(nkey)) ++neighbors; + if (auto nkey = neighborKey(mz_bin, scan_id, dm, ds)) + { + if (grid_.count(*nkey)) ++neighbors; + } } } @@ -419,9 +522,10 @@ namespace OpenMS for (int ds = -1; ds <= 1; ++ds) { if (dm == 0 && ds == 0) continue; - uint64_t nkey = (static_cast(static_cast(mz_bin + dm)) << 32) - | (scan_id + ds); - if (grid_.count(nkey)) ++neighbors; + if (auto nkey = neighborKey(mz_bin, scan_id, dm, ds)) + { + if (grid_.count(*nkey)) ++neighbors; + } } } if (neighbors < min_support) continue; @@ -448,12 +552,13 @@ namespace OpenMS { for (int ds = -2; ds <= 2; ++ds) { - uint64_t nkey = (static_cast(static_cast(mz_bin + dm)) << 32) - | (scan_id + ds); - auto it = denoised.find(nkey); - if (it != denoised.end()) + if (auto nkey = neighborKey(mz_bin, scan_id, dm, ds)) { - weighted_sum += it->second.intensity_sum * kernel[dm + 2][ds + 2]; + auto it = denoised.find(*nkey); + if (it != denoised.end()) + { + weighted_sum += it->second.intensity_sum * kernel[dm + 2][ds + 2]; + } } } } @@ -472,10 +577,11 @@ namespace OpenMS for (int ds = -1; ds <= 1 && is_max; ++ds) { if (dm == 0 && ds == 0) continue; - uint64_t nkey = (static_cast(static_cast(mz_bin + dm)) << 32) - | (scan_id + ds); - auto it = smoothed.find(nkey); - if (it != smoothed.end() && it->second > val) is_max = false; + if (auto nkey = neighborKey(mz_bin, scan_id, dm, ds)) + { + auto it = smoothed.find(*nkey); + if (it != smoothed.end() && it->second > val) is_max = false; + } } } if (is_max) maxima.push_back(key); @@ -497,16 +603,17 @@ namespace OpenMS { for (int ds = -2; ds <= 2; ++ds) { - uint64_t nkey = (static_cast(static_cast(center_mz_bin + dm)) << 32) - | (center_scan + ds); - auto it = denoised.find(nkey); - if (it != denoised.end()) + if (auto nkey = neighborKey(center_mz_bin, center_scan, dm, ds)) { - double int_val = it->second.intensity_sum; - double mz_val = it->second.mz_weighted_sum / it->second.intensity_sum; - total_intensity += int_val; - mz_weighted += mz_val * int_val; - scan_weighted += static_cast(static_cast(nkey & 0xFFFFFFFF)) * int_val; + auto it = denoised.find(*nkey); + if (it != denoised.end()) + { + double int_val = it->second.intensity_sum; + double mz_val = it->second.mz_weighted_sum / it->second.intensity_sum; + total_intensity += int_val; + mz_weighted += mz_val * int_val; + scan_weighted += static_cast(static_cast(*nkey & 0xFFFFFFFF)) * int_val; + } } } } @@ -524,7 +631,13 @@ namespace OpenMS /// Clear grid for reuse void clear() { grid_.clear(); } + /// Pre-reserve bucket space for N cells. Call once before a hot loop + /// to avoid rehash cascades. Safe to over-allocate — clear() preserves + /// the bucket array across center frames. + void reserve(size_t n) { grid_.reserve(n); } + private: + double mz_bin_width_; struct Cell { double intensity_sum = 0.0; @@ -979,7 +1092,8 @@ namespace OpenMS std::vector cent_mz, cent_intensity; std::vector cent_im; centroider.centroid(config.ms1_centroid_mz_ppm, config.ms1_centroid_im_pct, - cent_mz, cent_intensity, cent_im); + cent_mz, cent_intensity, cent_im, + static_cast(config.ms1_centroid_max_peaks)); // Build MSSpectrum from centroided output spec.reserve(cent_mz.size()); @@ -1012,6 +1126,141 @@ namespace OpenMS frameToSpectrum(frame, spec, 1); } + // ===================================================================== + // Helper: build one aggregated MS1 spectrum from a sliding window of + // adjacent MS1 frames. Composes with within-frame IM centroiding + // (FrameCentroider) when ms1_centroid_mz_ppm/pct are set. + // + // Pre: center_idx is the output-emitting frame; N and RT cap come from + // config. Post: spec is populated with center-frame metadata and + // per-peak IM data. IMPeakType is IM_PROFILE or IM_CENTROIDED based on + // isCentroidingEnabled(config). + // ===================================================================== + static void loadAggregatedMS1Spectrum( + TimsDataHandle& handle, + const std::vector& ms1_frame_ids, + size_t center_idx, + const BrukerTimsFile::Config& config, + FrameAggregator& aggregator, + FrameCentroider& centroider, + MSSpectrum& spec) + { + const int N = config.ms1_n_neighbors; + const size_t size = ms1_frame_ids.size(); + const size_t lo = (static_cast(N) > center_idx) ? 0 : center_idx - N; + const size_t hi = std::min(center_idx + static_cast(N), size - 1); + + TimsFrame& center_frame = handle.get_frame(ms1_frame_ids[center_idx]); + + aggregator.clear(); + size_t contributing_frames = 0; + + for (size_t ni = lo; ni <= hi; ++ni) + { + TimsFrame& nframe = handle.get_frame(ms1_frame_ids[ni]); + + // Center is always included regardless of RT cap; neighbors are capped. + const bool is_center = (ni == center_idx); + if (!is_center && config.ms1_max_rt_distance_sec > 0.0 && + std::abs(nframe.time - center_frame.time) > config.ms1_max_rt_distance_sec) + { + continue; + } + + if (nframe.num_peaks == 0) continue; + + std::vector scan_ids(nframe.num_peaks); + std::vector intensities(nframe.num_peaks); + std::vector mzs(nframe.num_peaks); + + nframe.save_to_buffs(nullptr, scan_ids.data(), nullptr, intensities.data(), + mzs.data(), nullptr, nullptr); + + bool frame_contributed = false; + for (uint32_t p = 0; p < nframe.num_peaks; ++p) + { + aggregator.addPeak(mzs[p], intensities[p], scan_ids[p]); + frame_contributed = true; + } + if (frame_contributed) ++contributing_frames; + } + + const bool skip_denoise = (contributing_frames <= 1) || (config.ms1_min_support <= 0); + auto peaks = aggregator.finalize(config.ms1_min_support, skip_denoise); + + // Populate center-frame metadata regardless of whether peaks is empty — + // preserves the per-center-frame cadence invariant for streaming consumers. + spec.clear(true); + spec.setRT(center_frame.time); + spec.setMSLevel(1); + spec.setDriftTimeUnit(DriftTimeUnit::VSSC); + spec.setNativeID("frame=" + String(center_frame.id)); + + DataArrays::FloatDataArray im_array; + IMDataConverter::setIMUnit(im_array, DriftTimeUnit::VSSC); + + if (peaks.empty()) + { + spec.getFloatDataArrays().push_back(std::move(im_array)); + spec.setIMPeakType(IMPeakType::IM_PROFILE); + return; + } + + if (isCentroidingEnabled(config)) + { + // Unpack aggregator output into parallel arrays for FrameCentroider. + std::vector cent_mzs(peaks.size()); + std::vector cent_intensities(peaks.size()); + std::vector cent_ims(peaks.size()); + for (size_t i = 0; i < peaks.size(); ++i) + { + cent_mzs[i] = peaks[i].mz; + cent_intensities[i] = peaks[i].intensity; + double im_val = 0.0; + handle.scan2inv_ion_mobility_converter->convert( + center_frame.id, &im_val, &peaks[i].scan_id, 1); + cent_ims[i] = im_val; + } + + centroider.loadFrame(cent_mzs.data(), cent_intensities.data(), + cent_ims.data(), static_cast(peaks.size())); + + std::vector out_mz, out_intensity; + std::vector out_im; + centroider.centroid(config.ms1_centroid_mz_ppm, config.ms1_centroid_im_pct, + out_mz, out_intensity, out_im, + static_cast(config.ms1_centroid_max_peaks)); + + spec.reserve(out_mz.size()); + for (size_t i = 0; i < out_mz.size(); ++i) + { + Peak1D peak; + peak.setMZ(out_mz[i]); + peak.setIntensity(static_cast(out_intensity[i])); + spec.push_back(peak); + } + im_array.assign(out_im.begin(), out_im.end()); + spec.setType(SpectrumSettings::SpectrumType::CENTROID); + spec.setIMPeakType(IMPeakType::IM_CENTROIDED); + } + else + { + spec.reserve(peaks.size()); + im_array.reserve(peaks.size()); + for (const auto& peak : peaks) + { + spec.emplace_back(peak.mz, static_cast(peak.intensity)); + double im_val = 0.0; + handle.scan2inv_ion_mobility_converter->convert( + center_frame.id, &im_val, &peak.scan_id, 1); + im_array.push_back(static_cast(im_val)); + } + spec.setIMPeakType(IMPeakType::IM_PROFILE); + } + + spec.getFloatDataArrays().push_back(std::move(im_array)); + } + // ===================================================================== // isDIA_: detect DDA vs DIA by querying for SWATH window tables // ===================================================================== @@ -1128,6 +1377,7 @@ namespace OpenMS // --- MS1 frames --- FrameCentroider centroider; + FrameAggregator ms1_aggregator(0.01); std::vector ms1_frame_ids; for (uint32_t fid = handle->min_frame_id(); fid <= handle->max_frame_id(); ++fid) @@ -1135,13 +1385,23 @@ namespace OpenMS if (handle->has_frame(fid) && handle->get_frame(fid).msms_type == 0) ms1_frame_ids.push_back(fid); } + warnPartialMS1AggregationConfig(config, ms1_frame_ids.size()); + if (config.ms1_n_neighbors > 0) ms1_aggregator.reserve(300'000); startProgress(0, ms1_frame_ids.size(), "Streaming DIA-PASEF MS1 frames"); for (size_t i = 0; i < ms1_frame_ids.size(); ++i) { - TimsFrame& frame = handle->get_frame(ms1_frame_ids[i]); MSSpectrum spec; - loadMS1Spectrum(frame, spec, config, centroider); + if (config.ms1_n_neighbors > 0) + { + loadAggregatedMS1Spectrum(*handle, ms1_frame_ids, i, config, + ms1_aggregator, centroider, spec); + } + else + { + TimsFrame& frame = handle->get_frame(ms1_frame_ids[i]); + loadMS1Spectrum(frame, spec, config, centroider); + } // Sort peaks by m/z (and the associated IM float data array alongside). // TIMS save_to_buffs returns peaks in (scan_id, m/z-within-scan) order, // which is NOT globally m/z-sorted. Downstream consumers @@ -1154,6 +1414,7 @@ namespace OpenMS setProgress(i); } endProgress(); + centroider.reportCapSummary(static_cast(config.ms1_centroid_max_peaks)); // --- MS2 frames: raw per-WindowGroup iteration (no aggregation) --- auto windows = readDIAWindows(db, *handle->scan2inv_ion_mobility_converter); @@ -1372,6 +1633,7 @@ namespace OpenMS else ms2_frame_ids.push_back(fid); } + warnPartialMS1AggregationConfig(config, ms1_frame_ids.size()); // Read DDA precursors from SQL std::vector precursor_entries = readDDAPrecursors(db); @@ -1390,11 +1652,21 @@ namespace OpenMS // --- MS1 frames --- FrameCentroider centroider; + FrameAggregator ms1_aggregator(0.01); + if (config.ms1_n_neighbors > 0) ms1_aggregator.reserve(300'000); for (size_t i = 0; i < ms1_frame_ids.size(); ++i) { - TimsFrame& frame = handle.get_frame(ms1_frame_ids[i]); MSSpectrum spec; - loadMS1Spectrum(frame, spec, config, centroider); + if (config.ms1_n_neighbors > 0) + { + loadAggregatedMS1Spectrum(handle, ms1_frame_ids, i, config, + ms1_aggregator, centroider, spec); + } + else + { + TimsFrame& frame = handle.get_frame(ms1_frame_ids[i]); + loadMS1Spectrum(frame, spec, config, centroider); + } exp.addSpectrum(std::move(spec)); setProgress(i); } @@ -1742,6 +2014,7 @@ namespace OpenMS setProgress(ms1_frame_ids.size() + precursor_idx); } endProgress(); + centroider.reportCapSummary(static_cast(config.ms1_centroid_max_peaks)); } // ===================================================================== @@ -1762,19 +2035,31 @@ namespace OpenMS if (frame.msms_type == 0) ms1_frame_ids.push_back(fid); } + warnPartialMS1AggregationConfig(config, ms1_frame_ids.size()); // --- MS1 frames --- FrameCentroider centroider; + FrameAggregator ms1_aggregator(0.01); // finer bin for MS1 (see design spec) + if (config.ms1_n_neighbors > 0) ms1_aggregator.reserve(300'000); startProgress(0, ms1_frame_ids.size(), "Loading DIA-PASEF MS1 frames"); for (size_t i = 0; i < ms1_frame_ids.size(); ++i) { - TimsFrame& frame = handle.get_frame(ms1_frame_ids[i]); MSSpectrum spec; - loadMS1Spectrum(frame, spec, config, centroider); + if (config.ms1_n_neighbors > 0) + { + loadAggregatedMS1Spectrum(handle, ms1_frame_ids, i, config, + ms1_aggregator, centroider, spec); + } + else + { + TimsFrame& frame = handle.get_frame(ms1_frame_ids[i]); + loadMS1Spectrum(frame, spec, config, centroider); + } exp.addSpectrum(std::move(spec)); setProgress(i); } endProgress(); + centroider.reportCapSummary(static_cast(config.ms1_centroid_max_peaks)); // --- SWATH windows --- std::vector windows = readDIAWindows(db, *handle.scan2inv_ion_mobility_converter); @@ -1816,7 +2101,7 @@ namespace OpenMS Size progress_count = 0; const int N = config.dia_ms2_n_neighbors; - DIAFrameAggregator aggregator; + FrameAggregator aggregator; for (const auto& [group, frame_ids] : group_to_frames) { @@ -1830,6 +2115,7 @@ namespace OpenMS { setProgress(progress_count++); aggregator.clear(); + size_t contributing_frames = 0; // Determine neighbor range size_t lo = (i >= static_cast(N)) ? i - N : 0; @@ -1848,24 +2134,24 @@ namespace OpenMS nframe.save_to_buffs(nullptr, scan_ids.data(), nullptr, intensities.data(), mzs.data(), nullptr, nullptr); + bool frame_contributed = false; for (uint32_t p = 0; p < nframe.num_peaks; ++p) { // Filter by scan bounds (integer comparison, no IM conversion needed) if (scan_ids[p] >= win->scan_begin && scan_ids[p] <= win->scan_end) { aggregator.addPeak(mzs[p], intensities[p], scan_ids[p]); + frame_contributed = true; } } + if (frame_contributed) ++contributing_frames; } - // Denoise (skip if only 1 frame in range, or caller disabled it via min_support <= 0) - // TODO: This checks the index range, not the actual number of neighbor - // frames that contributed peaks to the grid. If neighbor frames exist - // but have zero peaks passing the IM filter for this window, the grid - // contains only single-frame data yet denoising still runs — which may - // remove valid isolated peaks. Consider counting actual contributing - // frames if this becomes an issue in practice. - bool skip_denoise = (hi - lo) < 1 || config.dia_ms2_min_support <= 0; + // Denoise skipped when the grid was populated from a single frame (or none), + // because spatial denoising requires neighbor context that only cross-frame + // aggregation provides. Also skipped when min_support <= 0 (user explicitly + // disabled the filter). + bool skip_denoise = (contributing_frames <= 1) || config.dia_ms2_min_support <= 0; auto peaks = config.dia_ms2_centroid ? aggregator.finalizeCentroided(config.dia_ms2_min_support, skip_denoise) : aggregator.finalize(config.dia_ms2_min_support, skip_denoise); @@ -1988,6 +2274,16 @@ namespace OpenMS // ===================================================================== void BrukerTimsFile::loadFrames_(TimsDataHandle& handle, MSExperiment& exp, const Config& config) { + // FRAME mode returns raw per-frame spectra; MS1 aggregation would break that + // contract, so warn and ignore the knob. + if (config.ms1_n_neighbors > 0) + { + OPENMS_LOG_WARN << "Warning: ms1_n_neighbors (=" << config.ms1_n_neighbors + << ") is ignored when export_mode=FRAME because FRAME mode returns " + << "raw frames. Set export_mode=AUTO or SPECTRUM to enable MS1 " + << "frame aggregation." << std::endl; + } + // Iterate all frames at each MS level for (int level = 1; level <= 2; ++level) { @@ -2025,6 +2321,8 @@ namespace OpenMS setProgress(i); } endProgress(); + if (level == 1) + centroider.reportCapSummary(static_cast(config.ms1_centroid_max_peaks)); } } diff --git a/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp b/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp index cf40c3d5228..bd2f1ba97b6 100644 --- a/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp +++ b/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp @@ -491,6 +491,62 @@ START_SECTION(DDA search engine IM annotation integration test) } END_SECTION +START_SECTION(DDA MS1 aggregation with RT cap test) +{ + BrukerTimsFile f; + + // Baseline: raw DDA MS1 + MSExperiment exp_raw; + f.load(OPENTIMS_DDA_TEST_DATA, exp_raw); + + // Aggregated: ms1_n_neighbors=1 uncapped + BrukerTimsFile::Config cfg_uncapped; + cfg_uncapped.ms1_n_neighbors = 1; + cfg_uncapped.ms1_max_rt_distance_sec = 0.0; + MSExperiment exp_uncapped; + f.load(OPENTIMS_DDA_TEST_DATA, exp_uncapped, cfg_uncapped); + + // Aggregated: tight 1.0s RT cap (may be inert if fixture MS1 cadence is tight) + BrukerTimsFile::Config cfg_capped; + cfg_capped.ms1_n_neighbors = 1; + cfg_capped.ms1_max_rt_distance_sec = 1.0; + MSExperiment exp_capped; + f.load(OPENTIMS_DDA_TEST_DATA, exp_capped, cfg_capped); + + auto count_ms1 = [](const MSExperiment& e, Size& n_spectra, double& total_intensity) + { + n_spectra = 0; total_intensity = 0.0; + for (const auto& s : e) + if (s.getMSLevel() == 1) + { + ++n_spectra; + for (const auto& p : s) total_intensity += p.getIntensity(); + } + }; + + Size raw_n, uncapped_n, capped_n; + double raw_i, uncapped_i, capped_i; + count_ms1(exp_raw, raw_n, raw_i); + count_ms1(exp_uncapped, uncapped_n, uncapped_i); + count_ms1(exp_capped, capped_n, capped_i); + + TEST_NOT_EQUAL(raw_n, 0); + TEST_NOT_EQUAL(uncapped_n, 0); + TEST_NOT_EQUAL(capped_n, 0); + + // Uncapped aggregation boosts intensity over raw + TEST_EQUAL(uncapped_i > raw_i, true); + + // Capped aggregation ≤ uncapped (cap can only exclude neighbors, never add) + TEST_EQUAL(capped_i <= uncapped_i, true); + + STATUS("DDA MS1 aggregation: raw intensity=" << raw_i + << " uncapped=" << uncapped_i + << " capped(1.0s)=" << capped_i + << " cap_reduction=" << (1.0 - capped_i / uncapped_i)); +} +END_SECTION + #endif // OPENTIMS_DDA_TEST_DATA #ifdef OPENTIMS_DIA_TEST_DATA @@ -593,7 +649,7 @@ START_SECTION(DIA MS2 aggregation test) // Count MS2 spectra, total MS2 peaks, and cumulative MS2 intensity in both. // - // DIAFrameAggregator (BrukerTimsFile.cpp) is a two-stage operation on a + // FrameAggregator (BrukerTimsFile.cpp) is a two-stage operation on a // sparse 2D grid keyed by (mz_bin, scan_id) — NOT (mz_bin, scan_id, frame_id): // // 1. SUM stage: for each (center_frame_i, window) position, accumulate @@ -747,7 +803,7 @@ START_SECTION(DIA MS2 centroiding without denoising (min_support=0)) { // Regression guard for the "centroid without noise filter" path exposed via // Config::dia_ms2_min_support = 0. With min_support=0 the 3x3 neighbor filter - // in DIAFrameAggregator::finalize{,Centroided} is a no-op (`neighbors < 0` is + // in FrameAggregator::finalize{,Centroided} is a no-op (`neighbors < 0` is // always false), so every populated grid cell survives the denoise step. // Under centroiding that means more local maxima → strictly more output peaks // than the denoised (min_support=1) path, because denoising preferentially @@ -804,6 +860,385 @@ START_SECTION(DIA MS2 centroiding without denoising (min_support=0)) } END_SECTION +START_SECTION(DIA MS1 default config regression test) +{ + // Regression guard: with all new MS1 aggregation knobs at their defaults + // (ms1_n_neighbors=0, ms1_min_support=0, ms1_max_rt_distance_sec=0.0), + // the DIA MS1 output must be byte-identical to the pre-PR load. This + // section fails if a future change accidentally enables aggregation + // by default. + BrukerTimsFile f; + + BrukerTimsFile::Config cfg_default; // all defaults + MSExperiment exp_default; + f.load(OPENTIMS_DIA_TEST_DATA, exp_default, cfg_default); + + // Snapshot: count MS1 spectra, total MS1 peaks, total MS1 intensity + Size ms1_spectra = 0, ms1_peaks = 0; + double ms1_intensity = 0.0; + for (const auto& spec : exp_default) + { + if (spec.getMSLevel() == 1) + { + ++ms1_spectra; + ms1_peaks += spec.size(); + for (const auto& p : spec) ms1_intensity += p.getIntensity(); + } + } + + TEST_NOT_EQUAL(ms1_spectra, 0); + TEST_NOT_EQUAL(ms1_peaks, 0); + + STATUS("DIA MS1 default-config snapshot: spectra=" << ms1_spectra + << " peaks=" << ms1_peaks + << " intensity=" << ms1_intensity); + + // Independently reproduce with ms1_n_neighbors=0 explicitly — assert + // the numbers match exactly, proving the knob at 0 is inert. + BrukerTimsFile::Config cfg_explicit_off; + cfg_explicit_off.ms1_n_neighbors = 0; + cfg_explicit_off.ms1_min_support = 0; + cfg_explicit_off.ms1_max_rt_distance_sec = 0.0; + MSExperiment exp_explicit_off; + f.load(OPENTIMS_DIA_TEST_DATA, exp_explicit_off, cfg_explicit_off); + + Size ms1_spectra_b = 0, ms1_peaks_b = 0; + double ms1_intensity_b = 0.0; + for (const auto& spec : exp_explicit_off) + { + if (spec.getMSLevel() == 1) + { + ++ms1_spectra_b; + ms1_peaks_b += spec.size(); + for (const auto& p : spec) ms1_intensity_b += p.getIntensity(); + } + } + + TEST_EQUAL(ms1_spectra_b, ms1_spectra); + TEST_EQUAL(ms1_peaks_b, ms1_peaks); + TEST_REAL_SIMILAR(ms1_intensity_b, ms1_intensity); +} +END_SECTION + +START_SECTION(DIA MS1 aggregation test) +{ + // RED test for MS1 frame aggregation. With ms1_n_neighbors=1 we expect + // ~same spectrum count as raw (edge-truncation aside), boosted intensity, + // and IM_PROFILE output with per-peak IM. + BrukerTimsFile f; + + MSExperiment exp_raw; + f.load(OPENTIMS_DIA_TEST_DATA, exp_raw); + + BrukerTimsFile::Config cfg; + cfg.ms1_n_neighbors = 1; + MSExperiment exp_agg; + f.load(OPENTIMS_DIA_TEST_DATA, exp_agg, cfg); + + Size raw_ms1_spectra = 0, raw_ms1_peaks = 0; + double raw_ms1_intensity = 0.0; + for (const auto& spec : exp_raw) + { + if (spec.getMSLevel() == 1) + { + ++raw_ms1_spectra; + raw_ms1_peaks += spec.size(); + for (const auto& p : spec) raw_ms1_intensity += p.getIntensity(); + } + } + + Size agg_ms1_spectra = 0, agg_ms1_peaks = 0; + double agg_ms1_intensity = 0.0; + for (const auto& spec : exp_agg) + { + if (spec.getMSLevel() == 1) + { + ++agg_ms1_spectra; + agg_ms1_peaks += spec.size(); + for (const auto& p : spec) agg_ms1_intensity += p.getIntensity(); + } + } + + TEST_NOT_EQUAL(raw_ms1_spectra, 0); + TEST_NOT_EQUAL(agg_ms1_spectra, 0); + + // Spectrum count: aggregated MS1 path emits exactly one spectrum per center + // frame (empty spectra emitted when all peaks filter out — see the empty- + // spectrum contract in loadAggregatedMS1Spectrum). Unlike the MS2 path, + // this is a hard equality: MS1 never drops spectra. + TEST_EQUAL(agg_ms1_spectra, raw_ms1_spectra); + + // Intensity boost: aggregation sums intensity across 2*N+1 = 3 frames. + // MS1 fixture shows ratio ≈ 2.99 (empirical) — much higher than MS2's ≈1.28 + // because MS1 ion populations are more stable across adjacent frames: peaks + // at the same nominal (mz_bin, scan_id) from different frames tend to land + // in different cells (scan_id jitter, m/z jitter beyond 0.01 Da), so the + // aggregator rarely merges them — closer to the naive 3x sum than MS2. + // Bounds reflect this: accept anywhere in [2.5, 3.1] as valid aggregation. + TEST_EQUAL(raw_ms1_intensity > 0.0, true); + const double intensity_ratio = agg_ms1_intensity / raw_ms1_intensity; + TEST_EQUAL(intensity_ratio >= 2.5, true); + TEST_EQUAL(intensity_ratio <= 3.1, true); + + // Output must carry per-peak IM with IM_PROFILE type + for (const auto& spec : exp_agg) + { + if (spec.getMSLevel() == 1 && !spec.empty()) + { + TEST_EQUAL(spec.containsIMData(), true); + TEST_EQUAL(spec.getIMPeakType() == IMPeakType::IM_PROFILE, true); + // MS1 spectra must NOT carry precursors (unlike MS2). + TEST_EQUAL(spec.getPrecursors().empty(), true); + break; + } + } + + STATUS("DIA MS1 aggregation: raw spectra=" << raw_ms1_spectra + << " peaks=" << raw_ms1_peaks + << " intensity=" << raw_ms1_intensity + << " | aggregated spectra=" << agg_ms1_spectra + << " peaks=" << agg_ms1_peaks + << " intensity=" << agg_ms1_intensity + << " | ratio=" << intensity_ratio); +} +END_SECTION + +START_SECTION(DIA MS1 aggregation with centroiding test) +{ + BrukerTimsFile f; + + // Aggregated, no centroiding (baseline) + BrukerTimsFile::Config cfg_profile; + cfg_profile.ms1_n_neighbors = 1; + MSExperiment exp_profile; + f.load(OPENTIMS_DIA_TEST_DATA, exp_profile, cfg_profile); + + // Aggregated + within-frame centroiding + BrukerTimsFile::Config cfg_centroid; + cfg_centroid.ms1_n_neighbors = 1; + cfg_centroid.ms1_centroid_mz_ppm = 5.0f; + cfg_centroid.ms1_centroid_im_pct = 3.0f; + MSExperiment exp_centroid; + f.load(OPENTIMS_DIA_TEST_DATA, exp_centroid, cfg_centroid); + + auto count_ms1 = [](const MSExperiment& e, Size& n_spectra, Size& n_peaks, double& total_intensity) + { + n_spectra = 0; n_peaks = 0; total_intensity = 0.0; + for (const auto& s : e) + if (s.getMSLevel() == 1) + { + ++n_spectra; + n_peaks += s.size(); + for (const auto& p : s) total_intensity += p.getIntensity(); + } + }; + + Size profile_n, profile_p, centroid_n, centroid_p; + double profile_i, centroid_i; + count_ms1(exp_profile, profile_n, profile_p, profile_i); + count_ms1(exp_centroid, centroid_n, centroid_p, centroid_i); + + TEST_NOT_EQUAL(centroid_n, 0); + + // Same spectrum count (centroiding doesn't drop spectra) + TEST_EQUAL(centroid_n, profile_n); + + // Centroiding collapses IM-adjacent peaks → fewer peaks than profile. + TEST_EQUAL(centroid_p < profile_p, true); + + // Intensity conservation: with the default ms1_centroid_max_peaks=100000 + // the cap rarely fires on typical aggregated MS1 surveys (top 100k peaks + // out of ~600k carry >80% of total intensity). Allow [0.5, 1.0] to cover + // dense fixtures where a noticeable long-tail still gets dropped. + const double ratio = centroid_i / profile_i; + TEST_EQUAL(ratio > 0.5, true); + TEST_EQUAL(ratio <= 1.0, true); + + // Output shape + for (const auto& spec : exp_centroid) + { + if (spec.getMSLevel() == 1 && !spec.empty()) + { + TEST_EQUAL(spec.containsIMData(), true); + TEST_EQUAL(spec.getIMPeakType() == IMPeakType::IM_CENTROIDED, true); + TEST_EQUAL(spec.getType() == SpectrumSettings::SpectrumType::CENTROID, true); + TEST_EQUAL(spec.getPrecursors().empty(), true); + break; + } + } + + STATUS("DIA MS1 aggregation+centroid: profile peaks=" << profile_p + << " centroid peaks=" << centroid_p + << " ratio=" << (static_cast(centroid_p) / profile_p) + << " intensity_ratio=" << ratio); +} +END_SECTION + +START_SECTION(DIA MS1 aggregation min_support test) +{ + BrukerTimsFile f; + + // No denoise (baseline from DIA MS1 aggregation test: intensity ratio ≈ 3.0) + BrukerTimsFile::Config cfg_no_denoise; + cfg_no_denoise.ms1_n_neighbors = 1; + cfg_no_denoise.ms1_min_support = 0; + MSExperiment exp_no_denoise; + f.load(OPENTIMS_DIA_TEST_DATA, exp_no_denoise, cfg_no_denoise); + + // With denoise + BrukerTimsFile::Config cfg_denoise; + cfg_denoise.ms1_n_neighbors = 1; + cfg_denoise.ms1_min_support = 1; + MSExperiment exp_denoise; + f.load(OPENTIMS_DIA_TEST_DATA, exp_denoise, cfg_denoise); + + Size no_denoise_peaks = 0, denoise_peaks = 0; + double no_denoise_intensity = 0.0, denoise_intensity = 0.0; + for (const auto& s : exp_no_denoise) + if (s.getMSLevel() == 1) + { + no_denoise_peaks += s.size(); + for (const auto& p : s) no_denoise_intensity += p.getIntensity(); + } + for (const auto& s : exp_denoise) + if (s.getMSLevel() == 1) + { + denoise_peaks += s.size(); + for (const auto& p : s) denoise_intensity += p.getIntensity(); + } + + TEST_NOT_EQUAL(no_denoise_peaks, 0); + TEST_NOT_EQUAL(denoise_peaks, 0); + + // Denoise drops peaks. Conservative lower bound: ≥10% drop; MS1 surveys + // often have many low-intensity isolated cells that the 3x3 filter culls. + TEST_EQUAL(denoise_peaks < no_denoise_peaks, true); + const double drop_frac = 1.0 - static_cast(denoise_peaks) / no_denoise_peaks; + TEST_EQUAL(drop_frac >= 0.10, true); + + // Intensity boost still present after denoise (signal peaks survive). + MSExperiment exp_raw; + f.load(OPENTIMS_DIA_TEST_DATA, exp_raw); + double raw_intensity = 0.0; + for (const auto& s : exp_raw) + if (s.getMSLevel() == 1) + for (const auto& p : s) raw_intensity += p.getIntensity(); + TEST_EQUAL(denoise_intensity > raw_intensity, true); + + STATUS("DIA MS1 min_support: no_denoise_peaks=" << no_denoise_peaks + << " denoise_peaks=" << denoise_peaks + << " drop_frac=" << drop_frac + << " denoise_intensity/raw=" << (denoise_intensity / raw_intensity)); +} +END_SECTION + +START_SECTION(DIA MS1 RT cap test) +{ + BrukerTimsFile f; + + // Uncapped reference: ms1_n_neighbors=2 gives a wider window for the cap + // to bite into. + BrukerTimsFile::Config cfg_uncapped; + cfg_uncapped.ms1_n_neighbors = 2; + cfg_uncapped.ms1_max_rt_distance_sec = 0.0; + MSExperiment exp_uncapped; + f.load(OPENTIMS_DIA_TEST_DATA, exp_uncapped, cfg_uncapped); + + // Tight cap: 0.5s (DIA MS1 cadence ~1.5s, so N=2 reaches ~3s total span; + // the 0.5s cap should truncate aggressively). + BrukerTimsFile::Config cfg_capped; + cfg_capped.ms1_n_neighbors = 2; + cfg_capped.ms1_max_rt_distance_sec = 0.5; + MSExperiment exp_capped; + f.load(OPENTIMS_DIA_TEST_DATA, exp_capped, cfg_capped); + + auto total_ms1_intensity = [](const MSExperiment& e) + { + double s = 0.0; + for (const auto& spec : e) + if (spec.getMSLevel() == 1) + for (const auto& p : spec) s += p.getIntensity(); + return s; + }; + + double uncapped_i = total_ms1_intensity(exp_uncapped); + double capped_i = total_ms1_intensity(exp_capped); + + TEST_EQUAL(uncapped_i > 0.0, true); + TEST_EQUAL(capped_i > 0.0, true); + TEST_EQUAL(capped_i < uncapped_i, true); + + // Cadence invariant: RT cap must never drop a center frame. Both runs emit + // exactly one MS1 spectrum per raw MS1 frame regardless of the cap. + auto count_ms1_spectra = [](const MSExperiment& e) + { + Size n = 0; + for (const auto& s : e) if (s.getMSLevel() == 1) ++n; + return n; + }; + MSExperiment exp_raw; + f.load(OPENTIMS_DIA_TEST_DATA, exp_raw); + const Size raw_ms1 = count_ms1_spectra(exp_raw); + TEST_EQUAL(count_ms1_spectra(exp_uncapped), raw_ms1); + TEST_EQUAL(count_ms1_spectra(exp_capped), raw_ms1); + + STATUS("DIA MS1 RT cap: uncapped=" << uncapped_i + << " capped(0.5s, N=2)=" << capped_i + << " reduction=" << (1.0 - capped_i / uncapped_i)); +} +END_SECTION + +START_SECTION(FRAME mode ignores ms1_n_neighbors test) +{ + BrukerTimsFile f; + + // Reference: raw FRAME export with defaults + BrukerTimsFile::Config cfg_ref; + cfg_ref.export_mode = BrukerTimsFile::Config::FRAME; + MSExperiment exp_ref; + f.load(OPENTIMS_DIA_TEST_DATA, exp_ref, cfg_ref); + + // Same load but with aggregation knob set — must be ignored + BrukerTimsFile::Config cfg_agg; + cfg_agg.export_mode = BrukerTimsFile::Config::FRAME; + cfg_agg.ms1_n_neighbors = 1; + MSExperiment exp_agg; + f.load(OPENTIMS_DIA_TEST_DATA, exp_agg, cfg_agg); + + Size ref_ms1 = 0, agg_ms1 = 0; + Size ref_peaks = 0, agg_peaks = 0; + double ref_intensity = 0.0, agg_intensity = 0.0; + for (const auto& s : exp_ref) + if (s.getMSLevel() == 1) + { + ++ref_ms1; + ref_peaks += s.size(); + for (const auto& p : s) ref_intensity += p.getIntensity(); + } + for (const auto& s : exp_agg) + if (s.getMSLevel() == 1) + { + ++agg_ms1; + agg_peaks += s.size(); + for (const auto& p : s) agg_intensity += p.getIntensity(); + } + + TEST_NOT_EQUAL(ref_ms1, 0); + // FRAME mode must emit one spectrum per raw MS1 frame regardless of + // ms1_n_neighbors — aggregation is ignored. + TEST_EQUAL(agg_ms1, ref_ms1); + // Stronger: content must be identical too. Aggregation would inflate both + // peak count and cumulative intensity; FRAME mode's "raw frames" contract + // requires bit-for-bit behavior regardless of the ms1_n_neighbors knob. + TEST_EQUAL(agg_peaks, ref_peaks); + TEST_REAL_SIMILAR(agg_intensity, ref_intensity); + + // Warning emission is not asserted here — the test framework has no + // log-capture helper at present. Manual verification: the test output + // should show the "Warning: ms1_n_neighbors ... ignored" line. +} +END_SECTION + START_SECTION(DIA readDIAMetadata test) { BrukerTimsFile f; @@ -899,6 +1334,67 @@ START_SECTION(DIA loadDIAStreaming test) } END_SECTION +START_SECTION(DIA MS1 streaming aggregation test) +{ + BrukerTimsFile f; + + // Baseline: streaming with defaults (no aggregation) + ExperimentalSettings settings_ref; + auto meta_ref = f.readDIAMetadata(OPENTIMS_DIA_TEST_DATA, settings_ref); + RegularSwathFileConsumer consumer_ref(meta_ref.boundaries); + consumer_ref.setExperimentalSettings(settings_ref); + f.loadDIAStreaming(OPENTIMS_DIA_TEST_DATA, consumer_ref); + std::vector maps_ref; + consumer_ref.retrieveSwathMaps(maps_ref); + + // Aggregated streaming (ms1_n_neighbors=1) + ExperimentalSettings settings_agg; + auto meta_agg = f.readDIAMetadata(OPENTIMS_DIA_TEST_DATA, settings_agg); + RegularSwathFileConsumer consumer_agg(meta_agg.boundaries); + consumer_agg.setExperimentalSettings(settings_agg); + BrukerTimsFile::Config cfg; + cfg.ms1_n_neighbors = 1; + f.loadDIAStreaming(OPENTIMS_DIA_TEST_DATA, consumer_agg, cfg); + std::vector maps_agg; + consumer_agg.retrieveSwathMaps(maps_agg); + + // Locate each run's MS1 map by its ms1 flag (not by position). + auto find_ms1_map = [](const std::vector& maps) + -> const OpenSwath::SwathMap* { + for (const auto& m : maps) if (m.ms1) return &m; + return nullptr; + }; + const auto* ms1_ref = find_ms1_map(maps_ref); + const auto* ms1_agg = find_ms1_map(maps_agg); + TEST_NOT_EQUAL(ms1_ref, nullptr); + TEST_NOT_EQUAL(ms1_agg, nullptr); + + // Cadence invariant: both paths deliver one spectrum per raw MS1 frame. + TEST_EQUAL(ms1_ref->sptr->getNrSpectra(), + static_cast(meta_ref.nr_ms1_spectra)); + TEST_EQUAL(ms1_agg->sptr->getNrSpectra(), + static_cast(meta_agg.nr_ms1_spectra)); + + // Proof that aggregation actually ran: total MS1 peak count must be + // strictly higher in the aggregated path (stacked 2*N+1=3 frames of raw + // peaks into each output spectrum). If ms1_n_neighbors were silently + // ignored in loadDIAStreaming, the counts would be equal. + auto total_peaks = [](const OpenSwath::SwathMap& m) -> Size { + Size n = 0; + for (Size i = 0; i < m.sptr->getNrSpectra(); ++i) + n += m.sptr->getSpectrumById(i)->getMZArray()->data.size(); + return n; + }; + const Size peaks_ref = total_peaks(*ms1_ref); + const Size peaks_agg = total_peaks(*ms1_agg); + TEST_EQUAL(peaks_agg > peaks_ref, true); + + STATUS("DIA MS1 streaming aggregation: ref peaks=" << peaks_ref + << " agg peaks=" << peaks_agg + << " ratio=" << (static_cast(peaks_agg) / peaks_ref)); +} +END_SECTION + #endif // OPENTIMS_DIA_TEST_DATA END_TEST diff --git a/src/topp/FileConverter.cpp b/src/topp/FileConverter.cpp index acb6fd8fec7..41f699c9ba7 100644 --- a/src/topp/FileConverter.cpp +++ b/src/topp/FileConverter.cpp @@ -210,6 +210,35 @@ class TOPPFileConverter : "Apply 2D Gaussian smoothing + local maxima peak picking to the denoised DIA MS2 grid. " "Produces IM_CENTROIDED spectra with sub-bin (m/z, IM) precision. Only effective when dia_ms2_n_neighbors > 0.", false, true); setValidStrings_("bruker:dia_ms2_centroid", {"true", "false"}); + + registerIntOption_("bruker:ms1_n_neighbors", "", 0, + "MS1 frame aggregation: number of adjacent MS1 frames on each side to sum. " + "0 = disabled (raw export), 1 = 3-frame sum, 2 = 5-frame sum. " + "Applies to both DIA and DDA; ignored in FRAME export mode.", false, true); + setMinInt_("bruker:ms1_n_neighbors", 0); + setMaxInt_("bruker:ms1_n_neighbors", 50); + + registerIntOption_("bruker:ms1_min_support", "", 0, + "MS1 denoising: minimum occupied neighbor cells in a 3x3 (m/z x IM) grid to keep a point. " + "Applied after aggregation. 0 = disabled, 8 = all 8 neighbors required (strictest). " + "Only effective when ms1_n_neighbors > 0. Appropriate for dense survey runs; disable for " + "rare-species discovery.", false, true); + setMinInt_("bruker:ms1_min_support", 0); + setMaxInt_("bruker:ms1_min_support", 8); + + registerDoubleOption_("bruker:ms1_max_rt_distance_sec", "", 0.0, + "Cap the RT distance (seconds) between a neighbor MS1 frame and the center frame during " + "aggregation. 0.0 = no cap. Recommended for DDA (e.g. 5.0) where MS1 frame cadence is " + "irregular. The center frame is always included regardless of this cap.", false, true); + setMinFloat_("bruker:ms1_max_rt_distance_sec", 0.0); + + registerIntOption_("bruker:ms1_centroid_max_peaks", "", 100000, + "Cap on the number of centroided peaks retained per MS1 spectrum. Top-intensity peaks " + "are kept; low-intensity tail is dropped if the limit is hit (a warning is logged in that " + "case). Only effective when MS1 centroiding is enabled via ms1_centroid_mz_ppm/pct. Raise " + "for aggregated MS1 (ms1_n_neighbors > 0) on dense surveys; lower to trim long-tail noise.", + false, true); + setMinInt_("bruker:ms1_centroid_max_peaks", 1); #endif registerTOPPSubsection_("RawToMzML", "Options for converting raw files to mzML (uses ThermoRawFileParser)"); @@ -246,6 +275,10 @@ class TOPPFileConverter : c.dia_ms2_n_neighbors = getIntOption_("bruker:dia_ms2_n_neighbors"); c.dia_ms2_min_support = getIntOption_("bruker:dia_ms2_min_support"); c.dia_ms2_centroid = (getStringOption_("bruker:dia_ms2_centroid") == "true"); + c.ms1_n_neighbors = getIntOption_("bruker:ms1_n_neighbors"); + c.ms1_min_support = getIntOption_("bruker:ms1_min_support"); + c.ms1_max_rt_distance_sec = getDoubleOption_("bruker:ms1_max_rt_distance_sec"); + c.ms1_centroid_max_peaks = getIntOption_("bruker:ms1_centroid_max_peaks"); return c; } #endif From 2c00af44559e37bb680355e8fe405d0404956c8a Mon Sep 17 00:00:00 2001 From: LuBox2K <74805856+LuBox2K@users.noreply.github.com> Date: Thu, 16 Apr 2026 13:18:58 +0200 Subject: [PATCH 09/20] refactor: merge LinearResampler into LinearResamplerAlign (#9153) (#9153) LinearResamplerAlign is a strict superset of LinearResampler (adds explicit raster alignment, ppm-spaced rasters, chromatogram support and a linear-interpolation mode). Delete the narrower LinearResampler class and redirect its callers (Tutorial_SavitzkyGolayFilter, ImageCreator, DefaultParamHandlerDocumenter, pyOpenMS binding) to LinearResamplerAlign. Also add a runtime warning (verifySpacing_) when the configured spacing is finer than the minimum distance between adjacent input points, which can produce spurious peaks below the detector dead time. Co-authored-by: Timo Sachsenberg Co-authored-by: Luis Jacob Keller Co-authored-by: Claude Opus 4.6 (1M context) --- AUTHORS | 2 + CHANGELOG | 1 + .../Tutorial_SavitzkyGolayFilter.cpp | 4 +- .../DefaultParamHandlerDocumenter.cpp | 2 - .../PROCESSING/RESAMPLING/LinearResampler.h | 154 --------------- .../RESAMPLING/LinearResamplerAlign.h | 85 ++++++-- .../PROCESSING/RESAMPLING/sources.cmake | 1 - .../PROCESSING/RESAMPLING/LinearResampler.cpp | 16 -- .../PROCESSING/RESAMPLING/sources.cmake | 1 - .../APPLICATIONS/GUITOOLS/ImageCreator.cpp | 1 - src/pyOpenMS/bindings/bind_misc.cpp | 32 ++- src/pyOpenMS/tests/unittests/test000.py | 12 +- .../class_tests/openms/executables.cmake | 1 - .../source/LinearResamplerAlign_test.cpp | 63 +++++- .../openms/source/LinearResampler_test.cpp | 185 ------------------ src/topp/Resampler.cpp | 2 +- 16 files changed, 155 insertions(+), 407 deletions(-) delete mode 100644 src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResampler.h delete mode 100644 src/openms/source/PROCESSING/RESAMPLING/LinearResampler.cpp delete mode 100644 src/tests/class_tests/openms/source/LinearResampler_test.cpp diff --git a/AUTHORS b/AUTHORS index 87cb713078a..7ed43f41be1 100644 --- a/AUTHORS +++ b/AUTHORS @@ -8,6 +8,7 @@ the authors tag in the respective file header. - Achal Bajpai - Aditya Muzumdar - Ahmed Khalil + - Alen Saric - Alexandra Scherbart - Alexandra Zerck - Amanda Wein @@ -73,6 +74,7 @@ the authors tag in the respective file header. - Leon Bichmann - Leon Kuchenbecker - Lucia Espona + - Luis Jacob Keller - Lukas Mueller - Lukas Zimmermann - Marcel Schilling diff --git a/CHANGELOG b/CHANGELOG index bade880547c..91e0731ff84 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -295,6 +295,7 @@ OpenMS Library: - BREAKING: PeptideAndProteinQuant channel identifiers changed from Int to UInt (#8516) - BREAKING: Plain enums migrated to enum class (Sample::SampleState, OpenMS_OS, etc.) (#8622) - ILPDCWrapper: Performance improvement using std::unordered_map (~3.5x speedup) (#8618) + - DELETION: deleted Class LinearResampler and refactored functionality into LinearResamplerAlign (#9153) - OpenSwath: Use unordered_map for O(1) lookups, improving performance (#8651) - DataValue: Enhanced conversion error messages with type and value context (#8572) - BayesianProteinInferenceAlgorithm: Now requires single merged ProteinIdentification run (#8480) diff --git a/doc/code_examples/Tutorial_SavitzkyGolayFilter.cpp b/doc/code_examples/Tutorial_SavitzkyGolayFilter.cpp index 27f57d990af..8b24e5827c2 100644 --- a/doc/code_examples/Tutorial_SavitzkyGolayFilter.cpp +++ b/doc/code_examples/Tutorial_SavitzkyGolayFilter.cpp @@ -3,7 +3,7 @@ // #include -#include +#include #include #include #include // exotic header for path to tutorial data @@ -21,7 +21,7 @@ int main(int argc, const char** argv) // Load the dta file into the spectrum FileHandler().loadSpectrum(file_dta, spectrum); - LinearResampler lr; + LinearResamplerAlign lr; Param param_lr; param_lr.setValue("spacing", 0.01); lr.setParameters(param_lr); diff --git a/doc/doxygen/parameters/DefaultParamHandlerDocumenter.cpp b/doc/doxygen/parameters/DefaultParamHandlerDocumenter.cpp index 2b6deda729b..899ce9614b2 100644 --- a/doc/doxygen/parameters/DefaultParamHandlerDocumenter.cpp +++ b/doc/doxygen/parameters/DefaultParamHandlerDocumenter.cpp @@ -90,7 +90,6 @@ #include #include #include -#include #include #include #include @@ -390,7 +389,6 @@ int main(int argc, char** argv) DOCME(ItraqEightPlexQuantitationMethod); DOCME(ItraqFourPlexQuantitationMethod); DOCME(LabeledPairFinder); - DOCME(LinearResampler); DOCME(MSPFile); DOCME(MapAlignmentAlgorithmPoseClustering); DOCME(SpectrumAnnotator); diff --git a/src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResampler.h b/src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResampler.h deleted file mode 100644 index f09bf6f17d9..00000000000 --- a/src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResampler.h +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin -// SPDX-License-Identifier: BSD-3-Clause -// -// -------------------------------------------------------------------------- -// $Maintainer: Timo Sachsenberg $ -// $Authors: Eva Lange $ -// -------------------------------------------------------------------------- - -#pragma once - -#include -#include -#include -#include - -#include -#include - -namespace OpenMS -{ - /** - @brief Linear Resampling of raw data. - - This class can be used to generate uniform data from non-uniform raw data (e.g. ESI-TOF or MALDI-TOF experiments). - Therefore the intensity at every position x in the input raw data is spread to the two - adjacent resampling points. - This method preserves the area of the input signal and also the centroid position of a peak. - Therefore it is recommended for quantitation as well as for ProteinIdentification experiments. - - @note Use this method only for high resolution data (< 0.1 Th between two adjacent raw data points). - The resampling rate should be >= the precision. - - @htmlinclude OpenMS_LinearResampler.parameters - */ - class OPENMS_DLLAPI LinearResampler : - public DefaultParamHandler, - public ProgressLogger - { - -public: - - /// Constructor - LinearResampler() : - DefaultParamHandler("LinearResampler") - { - defaults_.setValue("spacing", 0.05, "Spacing of the resampled output peaks."); - defaultsToParam_(); - } - - /// Destructor. - ~LinearResampler() override - { - } - - /** - @brief Applies the resampling algorithm to an MSSpectrum, without alignment between spectra. - */ - void raster(MSSpectrum& spectrum) const - { - //return if nothing to do - if (spectrum.empty()) return; - - typename MSSpectrum::iterator first = spectrum.begin(); - typename MSSpectrum::iterator last = spectrum.end(); - - double end_pos = (last - 1)->getMZ(); - double start_pos = first->getMZ(); - int number_raw_points = static_cast(spectrum.size()); - int number_resampled_points = static_cast(ceil((end_pos - start_pos) / spacing_ + 1)); - - std::vector resampled_peak_container; - resampled_peak_container.resize(number_resampled_points); - - // generate the resampled peaks at positions origin+i*spacing_ - std::vector::iterator it = resampled_peak_container.begin(); - for (int i = 0; i < number_resampled_points; ++i) - { - it->setMZ(start_pos + i * spacing_); - ++it; - } - - // spread the intensity h of the data point at position x to the left and right - // adjacent resampled peaks - double distance_left = 0.; - double distance_right = 0.; - int left_index = 0; - int right_index = 0; - - it = resampled_peak_container.begin(); - for (int i = 0; i < number_raw_points; ++i) - { - int help = static_cast(floor(((first + i)->getMZ() - start_pos) / spacing_)); - left_index = (help < 0) ? 0 : help; - help = number_resampled_points - 1; - right_index = (left_index >= help) ? help : left_index + 1; - - // boundary case: raw point falls on (or is clamped to) the last resampled - // point so left_index == right_index. Both interpolation distances are 0, - // which would lose the entire intensity. Assign it directly instead. - if (left_index == right_index) - { - double intensity = static_cast((it + left_index)->getIntensity()); - intensity += static_cast((first + i)->getIntensity()); - (it + left_index)->setIntensity(intensity); - } - else - { - // compute the distance between x and the left adjacent resampled peak - distance_left = fabs((first + i)->getMZ() - (it + left_index)->getMZ()) / spacing_; - // compute the distance between x and the right adjacent resampled peak - distance_right = fabs((first + i)->getMZ() - (it + right_index)->getMZ()); - - // add the distance_right*h to the left resampled peak and distance_left*h to the right resampled peak - double intensity = static_cast((it + left_index)->getIntensity()); - intensity += static_cast((first + i)->getIntensity()) * distance_right / spacing_; - (it + left_index)->setIntensity(intensity); - intensity = static_cast((it + right_index)->getIntensity()); - intensity += static_cast((first + i)->getIntensity()) * distance_left; - (it + right_index)->setIntensity(intensity); - } - } - - spectrum.swap(resampled_peak_container); - } - - /** - @brief Resamples the data in an MSExperiment, without alignment between spectra. - */ - void rasterExperiment(PeakMap& exp) - { - startProgress(0, exp.size(), "resampling of data"); - for (Size i = 0; i < exp.size(); ++i) - { - raster(exp[i]); - setProgress(i); - } - endProgress(); - } - -protected: - - /// Spacing of the resampled data - double spacing_; - - void updateMembers_() override - { - spacing_ = param_.getValue("spacing"); - } - - }; - - -} // namespace OpenMS - diff --git a/src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResamplerAlign.h b/src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResamplerAlign.h index 82cb2b224a6..052cee86c1a 100644 --- a/src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResamplerAlign.h +++ b/src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResamplerAlign.h @@ -3,13 +3,18 @@ // // -------------------------------------------------------------------------- // $Maintainer: Hannes Roest $ -// $Authors: Hannes Roest $ +// $Authors: Hannes Roest, Luis Jacob Keller, Alen Saric$ // -------------------------------------------------------------------------- #pragma once -#include +#include +#include #include +#include +#include +#include +#include namespace OpenMS { @@ -21,20 +26,24 @@ namespace OpenMS Therefore the intensity at every position x in the input raw data is spread to the two adjacent resampling points. This method preserves the area of the input signal and also the centroid position of a peak. - Therefore it is recommended for quantitation as well as for ProteinIdentification experiments. - In addition to the LinearResampler, this class also allows to fix the + This class also allows to fix the points at which resampling will occur. This is useful if the resampling points are known in advance, e.g. if one needs to resample a chromatogram at the positions of another chromatogram. + Therefore it is recommended for quantitation as well as for ProteinIdentification experiments. + + @warning If the configured @p spacing is smaller than the minimal distance between two adjacent + input data points, intensity redistribution may produce spurious peaks. This can occur when + the sampling rate is finer than the detector's dead time. */ class OPENMS_DLLAPI LinearResamplerAlign : - public LinearResampler + public DefaultParamHandler, + public ProgressLogger { -public: - - LinearResamplerAlign() + public: + LinearResamplerAlign() : DefaultParamHandler("LinearResamplerAlign") { defaults_.setValue("spacing", 0.05, "Spacing of the resampled output peaks."); defaults_.setValue("ppm", "false", "Whether spacing is in ppm or Th"); @@ -73,7 +82,6 @@ namespace OpenMS /** @brief Applies the resampling algorithm to a container (MSSpectrum or MSChromatogram) with fixed coordinates. - The container will be resampled at equally spaced points between the supplied start and end positions. The resampling frequency can be controlled by the "spacing" parameter. @@ -102,6 +110,8 @@ namespace OpenMS auto first = container.begin(); auto last = container.end(); + verifySpacing_(first, last, [](auto x){return x->getPos();}); + // get the iterators just before / after the two points start_pos / end_pos while (first != container.end() && (first)->getPos() < start_pos) {++first;} while (last != first && (last - 1)->getPos() > end_pos) {--last;} @@ -143,6 +153,8 @@ namespace OpenMS OPENMS_PRECONDITION(resampled_begin != resampled_end, "Output iterators cannot be identical") // as we use +1 // OPENMS_PRECONDITION(raw_it != raw_end, "Input iterators cannot be identical") + verifySpacing_(raw_it, raw_end, [](auto x){return x->getPos();}); + PeakTypeIterator resample_start = resampled_begin; // need to get the raw iterator between two resampled iterators of the raw data @@ -220,6 +232,8 @@ namespace OpenMS "Raw m/z and intensity iterators need to cover the same distance") // OPENMS_PRECONDITION(raw_it != raw_end, "Input iterators cannot be identical") + verifySpacing_(mz_raw_it, mz_raw_end, [](auto x){return *x;}); + PeakTypeIterator mz_resample_start = mz_resample_it; // need to get the raw iterator between two resampled iterators of the raw data @@ -287,6 +301,8 @@ namespace OpenMS // OPENMS_PRECONDITION(resampled_start != resampled_end, "Output iterators cannot be identical") OPENMS_PRECONDITION(raw_it != raw_end, "Input iterators cannot be identical") // as we use +1 + verifySpacing_(raw_it, raw_end, [](PeakTypeIterator x){return x->getPos();}); + PeakTypeIterator raw_start = raw_it; // need to get the resampled iterator between two iterators of the raw data @@ -309,9 +325,31 @@ namespace OpenMS } + /** + @brief Applies the resampling algorithm to all spectra of an MSExperiment. + + Chromatograms stored in @p exp are not resampled. + + @param[in,out] exp The experiment whose spectra will be resampled in place. + */ + void rasterExperiment(PeakMap& exp) + { + startProgress(0, exp.size(), "resampling of data"); + for (Size i = 0; i < exp.size(); ++i) + { + raster(exp[i]); + setProgress(i); + } + endProgress(); + } + + protected: /// Spacing of the resampled data + double spacing_; + + /// Whether spacing_ is interpreted in ppm (true) or Th (false) bool ppm_; void updateMembers_() override @@ -352,8 +390,33 @@ namespace OpenMS } } } - }; -} + /// helper function to warn the user when the resampling rate is too high + template + void verifySpacing_(PeakTypeIterator it, PeakTypeIterator end, auto access) + { + // ppm_ spacing is relative (parts-per-million) and cannot be compared + // directly against the absolute neighbour distance computed below. + if (ppm_) return; + if (it == end || std::next(it) == end) return; + double min_dist = std::numeric_limits::infinity(); + double current_dist{}; + + while (std::next(it) != end) + { + current_dist = (access(std::next(it)) - access(it)); + if (min_dist > current_dist) min_dist = current_dist; + ++it; + } + + if (spacing_ < min_dist) + { + OPENMS_LOG_WARN << "Resampling spacing (" << spacing_ + << ") is smaller than the smallest distance between data points (" + << min_dist << "). This approximates the detector dead time and may produce spurious peaks.\n"; + } + } + }; +} diff --git a/src/openms/include/OpenMS/PROCESSING/RESAMPLING/sources.cmake b/src/openms/include/OpenMS/PROCESSING/RESAMPLING/sources.cmake index f6648e1bea9..600b7b27d0e 100644 --- a/src/openms/include/OpenMS/PROCESSING/RESAMPLING/sources.cmake +++ b/src/openms/include/OpenMS/PROCESSING/RESAMPLING/sources.cmake @@ -3,7 +3,6 @@ set(directory include/OpenMS/PROCESSING/RESAMPLING) ### list all header files of the directory here set(sources_list_h -LinearResampler.h LinearResamplerAlign.h ) diff --git a/src/openms/source/PROCESSING/RESAMPLING/LinearResampler.cpp b/src/openms/source/PROCESSING/RESAMPLING/LinearResampler.cpp deleted file mode 100644 index 63402a85e4b..00000000000 --- a/src/openms/source/PROCESSING/RESAMPLING/LinearResampler.cpp +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin -// SPDX-License-Identifier: BSD-3-Clause -// -// -------------------------------------------------------------------------- -// $Maintainer: Timo Sachsenberg $ -// $Authors: Eva Lange $ -// -------------------------------------------------------------------------- -// - -#include - -namespace OpenMS -{ - LinearResampler default_linear_resampler; - LinearResampler default_linear_resampler2; -} diff --git a/src/openms/source/PROCESSING/RESAMPLING/sources.cmake b/src/openms/source/PROCESSING/RESAMPLING/sources.cmake index 07d3d759a29..eb0d0910ff1 100644 --- a/src/openms/source/PROCESSING/RESAMPLING/sources.cmake +++ b/src/openms/source/PROCESSING/RESAMPLING/sources.cmake @@ -3,7 +3,6 @@ set(directory source/PROCESSING/RESAMPLING) ### list all filenames of the directory here set(sources_list -LinearResampler.cpp LinearResamplerAlign.cpp ) diff --git a/src/openms_gui/source/VISUAL/APPLICATIONS/GUITOOLS/ImageCreator.cpp b/src/openms_gui/source/VISUAL/APPLICATIONS/GUITOOLS/ImageCreator.cpp index 53afe331558..21fdab73e39 100644 --- a/src/openms_gui/source/VISUAL/APPLICATIONS/GUITOOLS/ImageCreator.cpp +++ b/src/openms_gui/source/VISUAL/APPLICATIONS/GUITOOLS/ImageCreator.cpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/src/pyOpenMS/bindings/bind_misc.cpp b/src/pyOpenMS/bindings/bind_misc.cpp index 64d0ad0a6f6..9b8e335e832 100644 --- a/src/pyOpenMS/bindings/bind_misc.cpp +++ b/src/pyOpenMS/bindings/bind_misc.cpp @@ -166,7 +166,6 @@ #include #include #include -#include #include #include #include @@ -2641,33 +2640,24 @@ BaseGroupFinder ; def_ProgressLogger(labeledpairfinder_class); - // ----------------------------------------------------------------------- - // LinearResampler - // ----------------------------------------------------------------------- - auto linearresampler_class = nb::class_(m, "LinearResampler", - R"doc( -DefaultParamHandler -ProgressLogger - -Annotates and filters transitions in a TargetedExperiment -:param exp: The input, unfiltered transitions -)doc") - .def(nb::init<>()) - .def("raster", [](const OpenMS::LinearResampler& self, OpenMS::MSSpectrum& spectrum) { return self.raster(spectrum); }, "spectrum"_a, "Applies the resampling algorithm to an MSSpectrum") - .def("rasterExperiment", [](OpenMS::LinearResampler& self, OpenMS::MSExperiment& exp) { return self.rasterExperiment(exp); }, "exp"_a, "Resamples the data in an MSExperiment") - ; - def_ProgressLogger(linearresampler_class); - // ----------------------------------------------------------------------- // LinearResamplerAlign // ----------------------------------------------------------------------- - auto linearresampleralign_class = nb::class_(m, "LinearResamplerAlign", + auto linearresampleralign_class = nb::class_(m, "LinearResamplerAlign", R"doc( Linear Resampling of raw data with alignment -LinearResampler +DefaultParamHandler +ProgressLogger )doc") .def(nb::init<>()) - .def("rasterExperiment", [](OpenMS::LinearResamplerAlign& self, OpenMS::MSExperiment& exp) { return self.rasterExperiment(exp); }, "exp"_a, "Resamples the data in an MSExperiment") + .def(nb::init()) + .def("__copy__", [](const OpenMS::LinearResamplerAlign& self) { return OpenMS::LinearResamplerAlign(self); }) + .def("__deepcopy__", [](const OpenMS::LinearResamplerAlign& self, nb::dict) { return OpenMS::LinearResamplerAlign(self); }, "memo"_a) + .def("raster", [](OpenMS::LinearResamplerAlign& self, OpenMS::MSSpectrum& spectrum) { return self.raster(spectrum); }, "spectrum"_a, "Applies the resampling algorithm to an MSSpectrum") + .def("raster", [](OpenMS::LinearResamplerAlign& self, OpenMS::MSChromatogram& chromatogram) { return self.raster(chromatogram); }, "chromatogram"_a, "Applies the resampling algorithm to an MSChromatogram") + .def("raster_align", [](OpenMS::LinearResamplerAlign& self, OpenMS::MSSpectrum& spectrum, double start_pos, double end_pos) { return self.raster_align(spectrum, start_pos, end_pos); }, "spectrum"_a, "start_pos"_a, "end_pos"_a, "Resamples an MSSpectrum onto an explicit raster [start_pos, end_pos]") + .def("raster_align", [](OpenMS::LinearResamplerAlign& self, OpenMS::MSChromatogram& chromatogram, double start_pos, double end_pos) { return self.raster_align(chromatogram, start_pos, end_pos); }, "chromatogram"_a, "start_pos"_a, "end_pos"_a, "Resamples an MSChromatogram onto an explicit raster [start_pos, end_pos]") + .def("rasterExperiment", [](OpenMS::LinearResamplerAlign& self, OpenMS::MSExperiment& exp) { return self.rasterExperiment(exp); }, "exp"_a, "Resamples all spectra in an MSExperiment") ; def_ProgressLogger(linearresampleralign_class); diff --git a/src/pyOpenMS/tests/unittests/test000.py b/src/pyOpenMS/tests/unittests/test000.py index 13aa9c4ee33..d06be67ef0c 100644 --- a/src/pyOpenMS/tests/unittests/test000.py +++ b/src/pyOpenMS/tests/unittests/test000.py @@ -1853,17 +1853,17 @@ def testItraqConstants(): @report -def testLinearResampler(): +def testLinearResamplerAlign(): """ - @tests: LinearResampler - LinearResampler.__init__ + @tests: LinearResamplerAlign + LinearResamplerAlign.__init__ """ - ff = pyopenms.LinearResampler() + ff = pyopenms.LinearResamplerAlign() p = ff.getDefaults() _testParam(p) - assert pyopenms.LinearResampler().raster is not None - assert pyopenms.LinearResampler().rasterExperiment is not None + assert pyopenms.LinearResamplerAlign().raster is not None + assert pyopenms.LinearResamplerAlign().rasterExperiment is not None @report def testPeptideAndProteinQuant(): diff --git a/src/tests/class_tests/openms/executables.cmake b/src/tests/class_tests/openms/executables.cmake index f0dadadc7af..dfb10a50cb5 100644 --- a/src/tests/class_tests/openms/executables.cmake +++ b/src/tests/class_tests/openms/executables.cmake @@ -346,7 +346,6 @@ set(filtering_executables_list GaussFilterAlgorithm_test IDFilter_test InternalCalibration_test - LinearResampler_test LinearResamplerAlign_test LowessSmoothing_test MassTraceDetection_test diff --git a/src/tests/class_tests/openms/source/LinearResamplerAlign_test.cpp b/src/tests/class_tests/openms/source/LinearResamplerAlign_test.cpp index 9ea5cb3d7c0..7f6a419fbed 100644 --- a/src/tests/class_tests/openms/source/LinearResamplerAlign_test.cpp +++ b/src/tests/class_tests/openms/source/LinearResamplerAlign_test.cpp @@ -3,19 +3,20 @@ // // -------------------------------------------------------------------------- // $Maintainer: Hannes Roest $ -// $Authors: Hannes Roest $ +// $Authors: Hannes Roest, Luis Jacob Keller, Alen Saric$ // -------------------------------------------------------------------------- #include -#include -#include #include -#include #include +#include +#include #include +#include + using namespace OpenMS; using namespace std; @@ -55,6 +56,19 @@ input_spectrum[3].setIntensity(2.0f); input_spectrum[4].setMZ(1.8); input_spectrum[4].setIntensity(1.0f); +START_SECTION(LinearResamplerAlign()) +{ + LinearResamplerAlign lr; + TEST_NOT_EQUAL(&lr, static_cast(nullptr)); +} +END_SECTION + +START_SECTION((~LinearResamplerAlign())) +{ + LinearResamplerAlign lr; +} +END_SECTION + // A spacing of 0.75 will lead to a recalculation of intensities, each // resampled point gets intensities from raw data points that are at most +/- // spacing away. @@ -112,7 +126,46 @@ START_SECTION([EXTRA] test_linear_res_chromat) } END_SECTION -START_SECTION(( void raster(ConstPeakTypeIterator raw_it, ConstPeakTypeIterator raw_end, PeakTypeIterator resample_it, PeakTypeIterator resample_end))) +START_SECTION(( template void rasterExperiment(MSExperiment& exp))) +{ + MSSpectrum spec; + spec.resize(5); + spec[0].setMZ(0); + spec[0].setIntensity(3.0f); + spec[1].setMZ(0.5); + spec[1].setIntensity(6.0f); + spec[2].setMZ(1.); + spec[2].setIntensity(8.0f); + spec[3].setMZ(1.6); + spec[3].setIntensity(2.0f); + spec[4].setMZ(1.8); + spec[4].setIntensity(1.0f); + + PeakMap exp; + exp.addSpectrum(spec); + exp.addSpectrum(spec); + + LinearResamplerAlign lr; + Param param; + param.setValue("spacing",0.5); + lr.setParameters(param); + lr.rasterExperiment(exp); + + + for (Size s=0; s -#include - - -#include - -#include -#include - -/////////////////////////// - -START_TEST(LinearResampler, "$Id$") - -///////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////// - -using namespace OpenMS; - -LinearResampler* lr_ptr = nullptr; -LinearResampler* lr_nullPointer = nullptr; - -START_SECTION((LinearResampler())) - lr_ptr = new LinearResampler; - TEST_NOT_EQUAL(lr_ptr, lr_nullPointer); -END_SECTION - -START_SECTION((~LinearResampler())) - delete lr_ptr; -END_SECTION - -START_SECTION((template void raster(MSSpectrum& spectrum))) -{ - MSSpectrum spec; - spec.resize(5); - spec[0].setMZ(0); - spec[0].setIntensity(3.0f); - spec[1].setMZ(0.5); - spec[1].setIntensity(6.0f); - spec[2].setMZ(1.); - spec[2].setIntensity(8.0f); - spec[3].setMZ(1.6); - spec[3].setIntensity(2.0f); - spec[4].setMZ(1.8); - spec[4].setIntensity(1.0f); - - LinearResampler lr; - Param param; - param.setValue("spacing",0.5); - lr.setParameters(param); - lr.raster(spec); - - double sum = 0.0; - for (Size i=0; i> raw points (regression test for index bug) -{ - // 3 raw points spanning a wide range, resampled at fine spacing - // This triggers the bug where right_index was clamped to number_raw_points-1 - // instead of number_resampled_points-1 - MSSpectrum spec; - spec.resize(3); - spec[0].setMZ(0); - spec[0].setIntensity(10.0f); - spec[1].setMZ(5.0); - spec[1].setIntensity(20.0f); - spec[2].setMZ(10.0); - spec[2].setIntensity(10.0f); - - LinearResampler lr; - Param param; - param.setValue("spacing", 0.1); - lr.setParameters(param); - lr.raster(spec); - - // resampled grid: 101 points (ceil((10-0)/0.1 + 1)) - TEST_EQUAL(spec.size(), 101); - - // total intensity must be conserved - double sum = 0.0; - for (Size i = 0; i < spec.size(); ++i) - { - sum += spec[i].getIntensity(); - } - TEST_REAL_SIMILAR(sum, 40.0); - - // no resampled point should exceed the max raw intensity - for (Size i = 0; i < spec.size(); ++i) - { - TEST_EQUAL(spec[i].getIntensity() <= 20.0 + 1e-5, true); - } -} -END_SECTION - -START_SECTION(( template void rasterExperiment(MSExperiment& exp))) -{ - MSSpectrum spec; - spec.resize(5); - spec[0].setMZ(0); - spec[0].setIntensity(3.0f); - spec[1].setMZ(0.5); - spec[1].setIntensity(6.0f); - spec[2].setMZ(1.); - spec[2].setIntensity(8.0f); - spec[3].setMZ(1.6); - spec[3].setIntensity(2.0f); - spec[4].setMZ(1.8); - spec[4].setIntensity(1.0f); - - PeakMap exp; - exp.addSpectrum(spec); - exp.addSpectrum(spec); - - LinearResampler lr; - Param param; - param.setValue("spacing",0.5); - lr.setParameters(param); - lr.rasterExperiment(exp); - - - for (Size s=0; s Date: Thu, 16 Apr 2026 21:12:06 +0200 Subject: [PATCH 10/20] contrib: Treat as a submodule so we don't release it separately (#9155) We want to use the submodule revision hash as the way to connect an OpenMS release to a version of contrib. By doing this we make releasing OpenMS easier because we don't have to tag/release contrib. Yes, I know this is slower for building on Windows. But contrib is going away in the next couple of months. I put the contrib build in its own job so that we don't hit run-time limits. --- .github/workflows/openms_ci_matrix_full.yml | 92 +++++++++++++++------ 1 file changed, 69 insertions(+), 23 deletions(-) diff --git a/.github/workflows/openms_ci_matrix_full.yml b/.github/workflows/openms_ci_matrix_full.yml index 73afde4ee31..06db1e2541f 100644 --- a/.github/workflows/openms_ci_matrix_full.yml +++ b/.github/workflows/openms_ci_matrix_full.yml @@ -38,7 +38,60 @@ on: # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: + # NOTE: This job is temporary until we get rid of contrib. + prepare-contrib-for-windows: + runs-on: windows-2025 + steps: + - name: Clone OpenMS + uses: actions/checkout@v6 + - name: Set up Visual Studio shell + uses: egor-tensin/vs-shell@v2 + with: { arch: x64 } + + - name: Identify contrib submodule revision for cache key + id: contrib-rev + shell: bash + run: | + rev=$(git submodule status --cached -- contrib | sed -E 's/^-?//' | cut -d' ' -f1) + echo "rev=$rev" >> "$GITHUB_OUTPUT" + + - name: Cache contrib + id: cache-contrib + uses: actions/cache@v5 + with: + path: contrib/build + key: ${{ runner.os }}-${{ steps.contrib-rev.outputs.rev }} + + - name: Fetch contrib if not in cache + if: steps.cache-contrib.outputs.cache-hit != 'true' + shell: bash + run: | + git submodule update --init -- contrib + mkdir -p contrib/build + + - name: Build contrib if not in cache + if: steps.cache-contrib.outputs.cache-hit != 'true' + shell: pwsh + run: | + $NPROC = $env:NUMBER_OF_PROCESSORS + if (-not $NPROC) { $NPROC = 4 } + $env:CMAKE_BUILD_PARALLEL_LEVEL = $NPROC + Set-Location "${{ github.workspace }}\contrib\build" + cmake -G"Visual Studio 17 2022" -Ax64 -DBUILD_TYPE=ALL "${{ github.workspace }}\contrib" + if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE } + + - name: Upload Contrib + uses: actions/upload-artifact@v7 + with: + name: contrib-${{ steps.contrib-rev.outputs.rev }} + path: | + contrib/build/lib/ + contrib/build/bin/ + contrib/build/include/ + retention-days: 7 + build-and-test: + needs: prepare-contrib-for-windows strategy: fail-fast: false matrix: @@ -326,28 +379,6 @@ jobs: echo "cmake_prefix=$(brew --prefix);$(brew --prefix libomp)" >>"$GITHUB_OUTPUT" fi - - name: Cache contrib (Windows) - if: startsWith(matrix.os, 'windows') - id: cache-contrib - uses: actions/cache@v4 - with: - path: ${{ github.workspace }}/OpenMS/contrib - key: ${{ runner.os }}-contrib4 - - - name: Download contrib build from archive (Windows) - if: startsWith(matrix.os, 'windows') && steps.cache-contrib.outputs.cache-hit != 'true' - shell: bash - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - cd OpenMS/contrib - # Download the file using the URL fetched from GitHub - gh release download -R OpenMS/contrib --pattern 'contrib_build-Windows.tar.gz' - # Extract the archive - 7z x -so contrib_build-Windows.tar.gz | 7z x -si -ttar - rm contrib_build-Windows.tar.gz - ls -la - - name: Build libcurl from source (Windows) if: startsWith(matrix.os, 'windows') shell: bash @@ -378,6 +409,21 @@ jobs: ${{ runner.os }}-${{ runner.arch }}-${{ matrix.compiler }}-${{ matrix.compiler_ver }}-ccache-develop ${{ runner.os }}-${{ runner.arch }}-${{ matrix.compiler }}-${{ matrix.compiler_ver }}-ccache- + - name: Identify contrib submodule revision for cache key + id: contrib-rev + shell: bash + run: | + cd OpenMS + rev=$(git submodule status --cached -- contrib | sed -E 's/^-?//' | cut -d' ' -f1) + echo "rev=$rev" >> "$GITHUB_OUTPUT" + + - name: Download contrib artifact for Windows + if: runner.os == 'Windows' + uses: actions/download-artifact@v4 + with: + name: contrib-${{ steps.contrib-rev.outputs.rev }} + path: OpenMS/contrib/build + - name: Add THIRDPARTY shell: bash run: | @@ -453,7 +499,7 @@ jobs: PYOPENMS: "${{ startsWith(matrix.os, 'ubuntu') && matrix.compiler == 'g++' && 'ON' || 'OFF' }}" NO_DEPENDENCIES: "${{ startsWith(matrix.os, 'ubuntu') && matrix.compiler == 'g++' && 'OFF' || 'ON' }}" CMAKE_PREFIX_PATH: "${{ steps.tools-prefix.outputs.cmake_prefix }};${{ steps.conda-setup.outputs.cmake_prefix }}" - OPENMS_CONTRIB_LIBS: "${{ github.workspace }}/OpenMS/contrib" + OPENMS_CONTRIB_LIBS: "${{ github.workspace }}/OpenMS/contrib/build" CI_PROVIDER: "GitHub-Actions" CMAKE_GENERATOR: "Ninja" SOURCE_DIRECTORY: "${{ github.workspace }}/OpenMS" From 4e1703bc207ebcc466d336d0bdd9eff003e67951 Mon Sep 17 00:00:00 2001 From: Adithya Kammati Date: Fri, 17 Apr 2026 00:53:41 +0530 Subject: [PATCH 11/20] Fix outdated links in contributing guide --- CONTRIBUTING.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 72ec8d41807..3ad9ff6f870 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -29,12 +29,13 @@ If you are an official OpenMS team member: # Opening a Pull Request -Before getting started we recommend taking a look at our GitHub-Wiki: https://github.com/OpenMS/OpenMS/wiki#-for-developers +Before getting started we recommend taking a look at the developer guide: +https://openms.readthedocs.io/en/latest/manual/develop.html Before you open the pull request, make sure you - - adhere to [our coding conventions](https://github.com/OpenMS/OpenMS/wiki/Coding-conventions) - - have [unit tests and functional tests](https://github.com/OpenMS/OpenMS/wiki/Write-tests) - see also [this example](https://github.com/OpenMS/OpenMS/blob/develop/src/tests/class_tests/openms/source/MSNumpressCoder_test.cpp) - - Have [proper documentation](https://github.com/OpenMS/OpenMS/wiki/Coding-conventions#doxygen) - see also [this example](https://github.com/OpenMS/OpenMS/blob/develop/src/openms/include/OpenMS/FORMAT/MSNumpressCoder.h) + - adhere to [our coding conventions](https://openms.readthedocs.io/en/latest/manual/develop.html#coding-conventions) + - have [unit tests and functional tests](https://openms.readthedocs.io/en/latest/manual/develop.html#automated-unit-tests) - see also [this example](https://github.com/OpenMS/OpenMS/blob/develop/src/tests/class_tests/openms/source/MSNumpressCoder_test.cpp) + - Have [proper documentation](https://openms.readthedocs.io/en/latest/manual/develop/developer-faq.html#doxygen-documentation) - see also [this example](https://github.com/OpenMS/OpenMS/blob/develop/src/openms/include/OpenMS/FORMAT/MSNumpressCoder.h) - have Python bindings — nanobind binding files are in `src/pyOpenMS/bindings/`; see `src/pyOpenMS/CLAUDE.md` for wrapping instructions A core developer will review your changes to the main development branch (develop) and approve them (or ask for modifications). You may indicate the prefered reviewer(s) by adding links to them in a comment section (e.g., @cbielow @hendrikweisser @hroest @jpfeuffer @timosachsenberg) From 9a8925da0fb8f639659ec45f7c0fbac58378f758 Mon Sep 17 00:00:00 2001 From: Julianus Pfeuffer <8102638+jpfeuffer@users.noreply.github.com> Date: Thu, 16 Apr 2026 21:31:41 +0200 Subject: [PATCH 12/20] Update CONTRIBUTING.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- CONTRIBUTING.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3ad9ff6f870..655ee2b5268 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -35,9 +35,8 @@ https://openms.readthedocs.io/en/latest/manual/develop.html Before you open the pull request, make sure you - adhere to [our coding conventions](https://openms.readthedocs.io/en/latest/manual/develop.html#coding-conventions) - have [unit tests and functional tests](https://openms.readthedocs.io/en/latest/manual/develop.html#automated-unit-tests) - see also [this example](https://github.com/OpenMS/OpenMS/blob/develop/src/tests/class_tests/openms/source/MSNumpressCoder_test.cpp) - - Have [proper documentation](https://openms.readthedocs.io/en/latest/manual/develop/developer-faq.html#doxygen-documentation) - see also [this example](https://github.com/OpenMS/OpenMS/blob/develop/src/openms/include/OpenMS/FORMAT/MSNumpressCoder.h) + - Have [proper documentation](https://openms.readthedocs.io/en/latest/manual/develop/developer-faq.html#doxygen-documentation) - see also [this example](https://github.com/OpenMS/OpenMS/blob/develop/src/openms/include/OpenMS/FORMAT/MSNumpressCoder.h) - have Python bindings — nanobind binding files are in `src/pyOpenMS/bindings/`; see `src/pyOpenMS/CLAUDE.md` for wrapping instructions - A core developer will review your changes to the main development branch (develop) and approve them (or ask for modifications). You may indicate the prefered reviewer(s) by adding links to them in a comment section (e.g., @cbielow @hendrikweisser @hroest @jpfeuffer @timosachsenberg) Also consider getting in contact with the core developers early. They might provide additional guidance and valuable information on how your specific aim is achieved. This might give you a head start in, for example, developing novel tools or algorithms. From 622398f42603c530e7fd04450384cf3617faf35e Mon Sep 17 00:00:00 2001 From: Julianus Pfeuffer <8102638+jpfeuffer@users.noreply.github.com> Date: Thu, 16 Apr 2026 21:35:44 +0200 Subject: [PATCH 13/20] Fix capitalization in documentation requirements --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 655ee2b5268..6655e135d0c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -35,7 +35,7 @@ https://openms.readthedocs.io/en/latest/manual/develop.html Before you open the pull request, make sure you - adhere to [our coding conventions](https://openms.readthedocs.io/en/latest/manual/develop.html#coding-conventions) - have [unit tests and functional tests](https://openms.readthedocs.io/en/latest/manual/develop.html#automated-unit-tests) - see also [this example](https://github.com/OpenMS/OpenMS/blob/develop/src/tests/class_tests/openms/source/MSNumpressCoder_test.cpp) - - Have [proper documentation](https://openms.readthedocs.io/en/latest/manual/develop/developer-faq.html#doxygen-documentation) - see also [this example](https://github.com/OpenMS/OpenMS/blob/develop/src/openms/include/OpenMS/FORMAT/MSNumpressCoder.h) + - have [proper documentation](https://openms.readthedocs.io/en/latest/manual/develop/developer-faq.html#doxygen-documentation) - see also [this example](https://github.com/OpenMS/OpenMS/blob/develop/src/openms/include/OpenMS/FORMAT/MSNumpressCoder.h) - have Python bindings — nanobind binding files are in `src/pyOpenMS/bindings/`; see `src/pyOpenMS/CLAUDE.md` for wrapping instructions A core developer will review your changes to the main development branch (develop) and approve them (or ask for modifications). You may indicate the prefered reviewer(s) by adding links to them in a comment section (e.g., @cbielow @hendrikweisser @hroest @jpfeuffer @timosachsenberg) From 6bb28b749c9ef9bf78d8603b2b46901fea6a1644 Mon Sep 17 00:00:00 2001 From: Timo Sachsenberg Date: Thu, 16 Apr 2026 21:46:14 +0200 Subject: [PATCH 14/20] feat: Bruker DDA native ID + load_ms1 toggle + CometAdapter mzParser workaround (#9160) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Bruker DDA native ID + load_ms1 toggle Two related BrukerTimsFile changes: 1. DDA MS2 native IDs change from "scan=" to "frame= scan= precursor=". Matches the DIA MS2 convention and carries all three physical identifiers (source frame, TIMS isolation-window start, Bruker Precursors.Id). Unblocks downstream cross-referencing and is the format mzParser/Comet require a workaround for (handled in CometAdapter). 2. New Config::load_ms1 bool (default true) + -bruker:load_ms1 CLI toggle. When false, MS1 spectra are not emitted to the MSExperiment or streaming consumer; applies to loadDIA_, loadDDA_, loadDIAStreaming, and loadFrames_ level==1. Partial-config warning fires if load_ms1=false combined with ms1_n_neighbors>0 (inconsistent). Two new test sections: - "DDA native ID format test" — asserts native ID carries frame=, scan=, precursor= components and starts with frame=. - "Bruker load_ms1=false test" — asserts zero MS1 and unchanged MS2 count for DDA, DIA, and FRAME export modes. Co-Authored-By: Claude Opus 4.6 (1M context) * feat(CometAdapter): workaround mzParser segfault on Bruker DDA mzML Comet's bundled mzParser (MSToolkit/saxmzmlhandler.cpp) grep-matches native IDs for "scan=" and uses atoi of the following digits as its spectrum-index sort key. When Bruker DDA MS2 IDs of the form "frame=F scan=S precursor=P" are written alongside MS1s that take the counter path ("frame=F"), scanNum values can collide — and cindex::compare() in MSToolkit/include/mzParser.h returns true for equal elements, violating std::sort's strict-weak-ordering invariant. The resulting UB manifests as a segfault during std::string shuffling in the sort. Workaround (upstream fix for cindex::compare should still be filed against UWPR/Comet separately): 1. Before writing the temp mzML fed to Comet, rewrite every native ID to "index=N" (monotonic counter). mzParser routes these through its counter path — no sort, no UB. Original ID is preserved as a per-spectrum MetaValue. 2. Let Comet run normally; it emits pep.xml with the rewritten IDs. 3. After IM/FAIMS annotation (which operates on the rewritten IDs in both pep.xml and exp), translate PSM spectrum_reference values back to the original Bruker IDs via a rewritten→original map. Also: default bruker_config.load_ms1=false when searching MS2. Comet only needs MS2; loading MS1 just wastes memory and time. Can be overridden via -bruker:load_ms1=true if the caller needs MS1 available in `exp` for custom post-processing. Co-Authored-By: Claude Opus 4.6 (1M context) * docs: CHANGELOG for Bruker DDA native ID + load_ms1 + Comet workaround Co-Authored-By: Claude Opus 4.6 (1M context) * fix: keep DDA native ID CV-compliant (MS:1002818) CodeRabbit review flagged that "frame=F scan=S precursor=P" violates the PSI-MS CV term MS:1002818 ("Bruker TDF nativeID format"), which mandates exactly "frame= scan=" — and BrukerTimsFile advertises that accession at BrukerTimsFile.cpp:1297-1298. Revert to the strict pattern and move the Bruker Precursors.Id SQL key out of the nativeID into a per-spectrum MetaValue "bruker_precursor_id". Downstream code that needs the SQL key reads the MetaValue; the native ID stays CV-compliant and mzParser-stable. Test updated to assert the CV pattern (frame= prefix, no precursor= in ID) and that bruker_precursor_id is present as a MetaValue. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: address CodeRabbit review on PR #9160 - BrukerTimsFile::transform(): respect config.load_ms1 when computing the consumer's expected spectrum count, so setExpectedSize matches what the underlying loadFrames_/loadDIA_/loadDDA_ paths actually emit - CometAdapter: drop the misleading claim that -bruker:load_ms1=true can override the forced MS1 skip (the option isn't registered here) - BrukerTimsFile_test: brace single-line if per OpenMS coding guidelines Co-Authored-By: Claude Opus 4.7 (1M context) * fix(bruker): correctness fixes from review of PR #9160 Three confirmed correctness bugs surfaced during the PR review: 1. BrukerTimsFile::transform() computed the DIA expected-spectrum count as counts.ms2 * windows.size(), but each MS2 frame belongs to exactly ONE WindowGroup. Actual emit count is the per-group sum Σ (frames_in_group × windows_in_group), identical to loadDIA_'s total_work formula. With multiple WindowGroups the previous formula overcounted by a factor proportional to the number of groups. 2. DDA MS2 nativeID collisions. This reader aggregates all PasefFrameMsMsInfo rows sharing the same Precursors.Id (potentially across multiple frames) into ONE spectrum — a timsrust-style design pwiz does NOT use (pwiz emits per-mobility-scan, where (frame, scan) is inherently unique). Under our per-precursor aggregation, first_entry->(frame_id, scan_begin) is NOT guaranteed unique across distinct precursors (PASEF co-isolation with different IsolationMz, multi-cycle precursors colliding on their first entry). Restore the trailing "precursor=

" token to the nativeID. This matches pwiz's combined-mode convention ("merged=N frame=F scanStart=A scanEnd=B" under the same MS:1002818 accession) and OpenMS's own DIA path ("frame=F windowGroup=G scan=S"). The CV accession MS:1002818 stays; Comet/Sage dispatch on nativeID content, not the CV accession, so downstream compatibility is preserved. Precursors.Id remains duplicated as a typed MetaValue "bruker_precursor_id" for direct lookup. Test adds a uniqueness assertion on the full set of DDA MS2 native IDs. 3. SpectrumNativeIDParser did not recognize MS:1002818 (accession-based overload returned -1; content-based overload didn't recognize the "frame=" prefix). Downstream tools relying on scan-number extraction (SpectrumLookup, USI, PepXMLFile, PercolatorInfile, IDMapper, ConsensusMapArrowExport) silently produced -1 or used the bare-int fallback for Bruker .d output. Add MS:1002818 to the "scan=" accession bucket and "frame=" to the prefix map; both route to the existing scan=(\d+) regex which picks up the scan_begin integer. CometAdapter's existing comments at :725 and :853 already described the "frame=F scan=S precursor=P" format and are now correct again without modification. CHANGELOG updated to document the real format and the downstream parser work. rationale for the nativeID format is documented inline in BrukerTimsFile::loadDDA_. Co-Authored-By: Claude Opus 4.7 (1M context) * fix(ProSE): swap cal_lower/cal_upper to match signed-error convention ProSEAlgorithm::runCalibrationPass_ was reflecting the calibration window around zero. Given the (lower, upper) schema established at ProSEAlgorithm.cpp:1615-1617 (theoretical ∈ [observed - lower, observed + upper] ⇔ signed error e ∈ [-upper, +lower]), mapping the intended window [shift - spread, shift + spread] requires: cal_lower = spread + shift // was: spread - shift cal_upper = spread - shift // was: spread + shift The filter at lines 1624-1625 already uses the correct convention; only the write-back at 1667-1668 was inverted. Impact on the Bruker DDA fixture (PR #9160's opentims integration test): on a precursor error distribution biased +3.57 ppm with 6.02 ppm spread, the bug made FragmentIndex accept e ∈ [-9.59, +2.44] (hard-capped, opposite-sign) instead of [-2.44, +9.59]. 1% FDR PSMs went from 1441 (buggy) to 4537 (fixed) — uncalibrated baseline is 4373, so calibration now correctly improves ID yield by ~4% instead of degrading it 3x. Bug introduced in commit 7445cd61e2 (PR #9132, "asymmetric precursor tolerance window"). Pre-existing on develop; surfaced because PR #9160 added a Bruker DDA integration test that exposes the signed precursor bias through which this swap becomes visible. Also adds floor-only PSM-count regression tripwires for the three DDA pipelines (SimpleSearchEngine, ProSE, ProSE calibrated). Floors sit ~5% below observed yields, catching real regressions while tolerating small algorithmic drift. The calibrated floor specifically guards the swap fix — a regression would drop PSMs back to ~1441 and trigger the floor. The check is a pure-CMake -P script (file(READ) + string(REGEX MATCHALL) since file(STRINGS) silently undercounts on large idXML files). Co-Authored-By: Claude Opus 4.7 (1M context) * test(bruker): tighten PSM-count floors to ~0.5% below current Original floors were ~5% below observed counts, too loose to catch a subtle regression (e.g., a partial nativeID or IM-annotation break that drops 2% of PSMs). Tighten to ~0.5%: SimpleSearchEngine DDA: 3900 -> 4170 (current 4187) ProSE DDA: 4100 -> 4350 (current 4373) ProSE DDA calibrated: 4300 -> 4510 (current 4537) If a legitimate change raises yield, the floor should be raised in the same commit. Co-Authored-By: Claude Opus 4.7 (1M context) * test(ProSE): align calibration-asymmetry assertion with fixed convention The calibration asymmetry check in the "calibration preserves asymmetric bias - normal case" section was matched to the previous buggy ordering (cal_upper > cal_lower for positive bias). After fixing the swap in runCalibrationPass_, the correct inequality is cal_lower > cal_upper: under (lower, upper) where signed error e ∈ [-upper, +lower], a positive bias widens +lower and tightens -upper. The order-invariant downstream uses (std::min(cal.cal_lower, cal.cal_upper) on line 1299) are unaffected. Co-Authored-By: Claude Opus 4.7 (1M context) * test(ProSE): strengthen calibration asymmetry-case assertions Previous version used tautological checks (cal_lower < 20.0 is trivially true because the code does std::min(raw, 20.0)) and a way-too-loose 5 ppm tolerance on the shift, so it couldn't actually catch a regression of the sign convention. Replace with: - Direction check: shift > 0 and spread > 0 for this positive-biased fixture - |shift| < spread precondition stated explicitly - Load-bearing functional identities binding the writeback mapping: cal_lower = spread + shift (upper endpoint of signed-e window) cal_upper = spread - shift (|lower endpoint| of signed-e window) These directly encode the (lower, upper) <-> [-upper, +lower] convention and would fail simultaneously if the swap bug were re-introduced. - Asymmetry check (cal_lower > cal_upper for positive bias) as a cross-check against mislabeled shift/spread producing consistent-but-wrong identities. - Keep the "< 20, < 30" checks only as a witness that the std::min cap is inactive (so the identities above are unconstrained), NOT as primary behavior assertions. Dropped the hardcoded shift=7.0 / spread=8.5 / cal_lower=15.5 / cal_upper=1.5 literals: the search pipeline doesn't include every synthetic shift in the calibration set, so those specific numbers depend on search-side filtering details. The functional identities are immune to that. Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.6 (1M context) --- CHANGELOG | 2 + .../include/OpenMS/FORMAT/BrukerTimsFile.h | 4 + .../source/ANALYSIS/ID/ProSEAlgorithm.cpp | 14 +- src/openms/source/FORMAT/BrukerTimsFile.cpp | 102 +++++++++++--- .../METADATA/SpectrumNativeIDParser.cpp | 20 ++- .../openms/source/BrukerTimsFile_test.cpp | 125 ++++++++++++++++++ .../openms/source/ProSEAlgorithm_test.cpp | 32 +++-- src/tests/topp/CMakeLists.txt | 28 ++++ src/tests/topp/check_psm_floor.cmake | 35 +++++ src/topp/CometAdapter.cpp | 56 +++++++- src/topp/FileConverter.cpp | 5 + 11 files changed, 384 insertions(+), 39 deletions(-) create mode 100644 src/tests/topp/check_psm_floor.cmake diff --git a/CHANGELOG b/CHANGELOG index 91e0731ff84..3258e2b2eb4 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -35,6 +35,8 @@ General: - Transparent .zip decompression for XML-based formats (mzML, idXML, etc.) and Bruker .d directories: files with a .zip extension are now decompressed on-the-fly during parsing; .d.zip archives are extracted to a temporary directory and loaded via BrukerTimsFile; no conversion step required (#9139) - Fix: BrukerTimsFile: DDA-PASEF and DIA-PASEF spectra are now marked as CENTROID (PeakType::PEAK) so downstream tools that check spectrum type (e.g. FeatureFinderMultiplex, OpenSwathWorkflow) accept them without extra conversion (#9154) - Bruker TimsTOF MS1 frame aggregation for DIA-PASEF and DDA-PASEF: new BrukerTimsFile::Config fields (ms1_n_neighbors, ms1_min_support, ms1_max_rt_distance_sec, ms1_centroid_max_peaks) sum peaks from 2N+1 adjacent MS1 frames into a sparse (mz_bin, scan_id) grid per center frame, composable with the existing within-frame IM centroiding (ms1_centroid_mz_ppm and ms1_centroid_im_pct). Exposed through FileConverter as -bruker:ms1_n_neighbors, -bruker:ms1_min_support, -bruker:ms1_max_rt_distance_sec, -bruker:ms1_centroid_max_peaks. Disabled by default. Also renames the internal DIAFrameAggregator class to FrameAggregator (scope-agnostic now that MS1 uses it), guards its (mz_bin, scan_id) grid lookups against uint32 wrap at scan_id=0, and switches the skip_denoise predicate in the MS2 aggregation loop from index-span to actual contributing-frame count (closes #9151 TODO). + - BrukerTimsFile: DDA MS2 native IDs changed from "scan=" to "frame= scan= precursor=". The frame/scan pair follows the PSI-MS MS:1002818 ("Bruker TDF nativeID format") convention (scan is the TIMS isolation-window scan_begin); the trailing "precursor=

" token is required for uniqueness under OpenMS's per-Precursors.Id aggregation (different precursors can share the same (frame, scan_begin) via PASEF co-isolation with different IsolationMz, and multi-cycle precursors can collide on their first entry) — this mirrors the convention pwiz uses in its "combined" mode ("merged=N frame=F scanStart=A scanEnd=B"), where extra tokens are added while the SourceFile CV accession remains MS:1002818. The Bruker Precursors.Id is ALSO duplicated as a typed MetaValue "bruker_precursor_id" for direct programmatic access. SpectrumNativeIDParser now recognizes MS:1002818 and the "frame=" prefix so downstream scan-number extraction (SpectrumLookup, USI, PepXMLFile, PercolatorInfile, IDMapper) works for Bruker .d output. New BrukerTimsFile::Config field load_ms1 (CLI: -bruker:load_ms1, default true): disable to skip MS1 loading entirely for MS2-only workflows (peptide database search) — cuts memory and time substantially. CometAdapter automatically sets load_ms1=false when searching MS2. BrukerTimsFile::transform() progress now matches actually-emitted spectra in DIA mode (per-WindowGroup sum, not frames × all_windows) and respects load_ms1 across all modes. + - CometAdapter: workaround for a latent bug in Comet's bundled mzParser (MSToolkit/saxmzmlhandler.cpp) that segfaults when mzML files contain mixed "frame=..." and "scan=..." native IDs with overlapping numeric values — which is exactly the Bruker .d DDA/DIA case. CometAdapter rewrites Bruker .d native IDs to monotonic "index=N" before handing the temp mzML to Comet, and translates PSM spectrum references back to the original Bruker IDs after Comet finishes. Upstream fix for cindex::compare's strict-weak-ordering violation should be filed against UWPR/Comet. Dependencies: - PyOpenMS: Autowrap/Cython replaced with nanobind; nanobind 2.10.0+ fetched automatically (#8699) diff --git a/src/openms/include/OpenMS/FORMAT/BrukerTimsFile.h b/src/openms/include/OpenMS/FORMAT/BrukerTimsFile.h index 39d5897a866..cc9ddc14064 100644 --- a/src/openms/include/OpenMS/FORMAT/BrukerTimsFile.h +++ b/src/openms/include/OpenMS/FORMAT/BrukerTimsFile.h @@ -52,6 +52,10 @@ namespace OpenMS double calibration_tolerance = 0.0; ///< m/z recalibration tolerance in Da (0 = default 0.1 Da) bool calibrate = false; ///< Enable m/z recalibration (off by default; may fail on some datasets) + bool load_ms1 = true; ///< Load MS1 spectra. Disable (false) for MS2-only workflows + ///< (peptide database search) where MS1 surveys are not needed — + ///< cuts memory and time substantially. Affects all export modes. + float ms1_centroid_mz_ppm = 0.0f; ///< MS1 IM-centroiding m/z tolerance in ppm (0 = disabled, suggested: 5.0). Adapted from Sage (Lazear 2023, doi:10.1021/acs.jproteome.3c00486). float ms1_centroid_im_pct = 0.0f; ///< MS1 IM-centroiding ion mobility tolerance in percent (0 = disabled, suggested: 3.0) int ms1_centroid_max_peaks = 100000; ///< Upper bound on centroided peaks per MS1 spectrum. Caps retention at the top-intensity peaks; low-intensity tail is dropped when the limit is hit (a warning is emitted at that point). Sized for aggregated MS1 where frame-stacked grids commonly exceed 200k peaks; per-frame MS1 rarely hits even 10k. diff --git a/src/openms/source/ANALYSIS/ID/ProSEAlgorithm.cpp b/src/openms/source/ANALYSIS/ID/ProSEAlgorithm.cpp index e4177f9af89..69a4f0c6c59 100644 --- a/src/openms/source/ANALYSIS/ID/ProSEAlgorithm.cpp +++ b/src/openms/source/ANALYSIS/ID/ProSEAlgorithm.cpp @@ -1660,12 +1660,16 @@ namespace OpenMS if (!result.extreme_bias) { - // Map signed window [shift - spread, shift + spread] to positive magnitudes: - // -cal_lower = shift - spread -> cal_lower = spread - shift - // +cal_upper = shift + spread -> cal_upper = spread + shift + // Map signed window [shift - spread, shift + spread] to the positive-magnitude + // (lower, upper) schema. Per the convention established above (lines ~1615-1617), + // a (lower, upper) pair means theoretical ∈ [observed - lower, observed + upper], + // equivalently the signed error e = observed - theoretical lies in [-upper, +lower]. + // Matching [shift - spread, shift + spread] against [-upper, +lower] gives: + // -cal_upper = shift - spread -> cal_upper = spread - shift + // +cal_lower = shift + spread -> cal_lower = spread + shift // Both are non-negative when |shift| < spread. - const double cal_lower_raw = result.precursor_spread - prec_shift; - const double cal_upper_raw = result.precursor_spread + prec_shift; + const double cal_lower_raw = result.precursor_spread + prec_shift; + const double cal_upper_raw = result.precursor_spread - prec_shift; // Only tighten - cap against user-configured bounds. result.cal_lower = std::min(cal_lower_raw, precursor_mass_tolerance_lower_); result.cal_upper = std::min(cal_upper_raw, precursor_mass_tolerance_upper_); diff --git a/src/openms/source/FORMAT/BrukerTimsFile.cpp b/src/openms/source/FORMAT/BrukerTimsFile.cpp index 4dad814d6fe..05624d5d052 100644 --- a/src/openms/source/FORMAT/BrukerTimsFile.cpp +++ b/src/openms/source/FORMAT/BrukerTimsFile.cpp @@ -381,6 +381,12 @@ namespace OpenMS << ms1_frame_count << "); effective window will be clamped " << "at run edges." << std::endl; } + if (!config.load_ms1 && config.ms1_n_neighbors > 0) + { + OPENMS_LOG_WARN << "Warning: ms1_n_neighbors (=" << config.ms1_n_neighbors + << ") ignored: load_ms1=false disables MS1 loading entirely." + << std::endl; + } } // ===================================================================== @@ -1388,8 +1394,9 @@ namespace OpenMS warnPartialMS1AggregationConfig(config, ms1_frame_ids.size()); if (config.ms1_n_neighbors > 0) ms1_aggregator.reserve(300'000); - startProgress(0, ms1_frame_ids.size(), "Streaming DIA-PASEF MS1 frames"); - for (size_t i = 0; i < ms1_frame_ids.size(); ++i) + const size_t ms1_to_emit = config.load_ms1 ? ms1_frame_ids.size() : 0; + startProgress(0, ms1_to_emit, "Streaming DIA-PASEF MS1 frames"); + for (size_t i = 0; i < ms1_to_emit; ++i) { MSSpectrum spec; if (config.ms1_n_neighbors > 0) @@ -1566,24 +1573,40 @@ namespace OpenMS bool is_dia = isDIA(db); Config::ExportMode mode = config.export_mode; - // Compute expected size for consumer + // Compute expected size for consumer. Must mirror what loadFrames_/loadDIA_/ + // loadDDA_ actually emit, which skip MS1 entirely when load_ms1=false. FrameCounts counts = readFrameCounts(db); + const size_t ms1_emitted = config.load_ms1 ? counts.ms1 : 0; size_t expected = 0; if (mode == Config::FRAME) { - expected = counts.total; + expected = ms1_emitted + counts.ms2; } else if (is_dia && mode != Config::SPECTRUM) { - // DIA: MS1 frames + MS2 frames * windows + // DIA: each MS2 frame belongs to exactly ONE WindowGroup and only + // produces spectra for that group's windows. So the emit count is + // Σ_group (frames_in_group × windows_in_group) + // — NOT frames × all_windows. Mirrors the total_work computation in + // loadDIA_. auto windows = readDIAWindows(db, *handle->scan2inv_ion_mobility_converter); - expected = counts.ms1 + static_cast(counts.ms2) * windows.size(); + auto group_to_frames = readFrameToWindowGroupMapping(db, windows); + std::map group_window_count; + for (const auto& w : windows) ++group_window_count[w.window_group]; + size_t dia_ms2_spectra = 0; + for (const auto& [group, frames] : group_to_frames) + { + auto wit = group_window_count.find(group); + if (wit != group_window_count.end()) + dia_ms2_spectra += frames.size() * wit->second; + } + expected = ms1_emitted + dia_ms2_spectra; } else { // DDA: MS1 frames + per-precursor MS2 spectra uint32_t num_precursors = countDDAPrecursors(db); - expected = counts.ms1 + num_precursors; + expected = ms1_emitted + num_precursors; } consumer->setExpectedSize(expected, 0); @@ -1646,15 +1669,16 @@ namespace OpenMS } uint32_t num_ms2 = static_cast(precursor_groups.size()); - exp.reserveSpaceSpectra(static_cast(ms1_frame_ids.size()) + num_ms2); + const size_t ms1_to_emit = config.load_ms1 ? ms1_frame_ids.size() : 0; + exp.reserveSpaceSpectra(static_cast(ms1_to_emit) + num_ms2); - startProgress(0, ms1_frame_ids.size() + num_ms2, "Loading DDA-PASEF data"); + startProgress(0, ms1_to_emit + num_ms2, "Loading DDA-PASEF data"); // --- MS1 frames --- FrameCentroider centroider; FrameAggregator ms1_aggregator(0.01); - if (config.ms1_n_neighbors > 0) ms1_aggregator.reserve(300'000); - for (size_t i = 0; i < ms1_frame_ids.size(); ++i) + if (config.load_ms1 && config.ms1_n_neighbors > 0) ms1_aggregator.reserve(300'000); + for (size_t i = 0; i < ms1_to_emit; ++i) { MSSpectrum spec; if (config.ms1_n_neighbors > 0) @@ -1866,7 +1890,7 @@ namespace OpenMS if (all_tofs.empty()) { ++precursor_idx; - setProgress(ms1_frame_ids.size() + precursor_idx); + setProgress(ms1_to_emit + precursor_idx); continue; } @@ -1946,7 +1970,7 @@ namespace OpenMS if (all_mz.empty()) { ++precursor_idx; - setProgress(ms1_frame_ids.size() + precursor_idx); + setProgress(ms1_to_emit + precursor_idx); continue; } @@ -1976,7 +2000,42 @@ namespace OpenMS spec.setDriftTime(scalar_im); spec.setDriftTimeUnit(DriftTimeUnit::VSSC); spec.setType(SpectrumSettings::SpectrumType::CENTROID); - spec.setNativeID("scan=" + String(prec_id)); + // Native ID: "frame= scan= precursor=

". + // + // This extends the PSI-MS MS:1002818 "Bruker TDF nativeID format" + // pattern ("frame= scan=") with a trailing + // "precursor=" token. The extension is REQUIRED for + // uniqueness under OpenMS's per-precursor aggregation: + // + // - Unlike pwiz/msconvert (which emits one MS2 spectrum per + // Bruker PASEF mobility-scan row, where (frame, scan_begin) is + // inherently unique), this reader ports timsrust's strategy of + // merging all PasefFrameMsMsInfo entries sharing the same + // Precursors.Id (potentially across multiple frames) into ONE + // aggregated MS2 spectrum — see the loop above and commit + // 41ce6cfeb (PR #8975). first_entry->(frame_id, scan_begin) is + // just the min-ordered row of that group and is NOT guaranteed + // unique across distinct precursors (PASEF can co-isolate + // multiple precursors in the same (Frame, ScanNumBegin) mobility + // window differing only by IsolationMz; multi-cycle precursors + // can also collide on their first entry). + // + // - Strict MS:1002818 validators may flag the trailing token, but + // the ecosystem convention (pwiz combined mode uses + // "merged=N frame=F scanStart=A scanEnd=B" under the SAME + // MS:1002818 accession; our own DIA path emits + // "frame=F windowGroup=G scan=S") is that vendor-specific + // disambiguators are added while keeping MS:1002818. Comet and + // Sage dispatch on nativeID content, not the CV accession, so + // compatibility is preserved in practice. + // + // - Bruker Precursors.Id is also duplicated as a typed MetaValue + // "bruker_precursor_id" for direct programmatic lookup without + // parsing the nativeID string. + spec.setNativeID("frame=" + String(first_entry->frame_id) + + " scan=" + String(first_entry->scan_begin) + + " precursor=" + String(prec_id)); + spec.setMetaValue("bruker_precursor_id", static_cast(prec_id)); // Copy sorted peak data spec.reserve(all_mz.size()); @@ -2011,7 +2070,7 @@ namespace OpenMS exp.addSpectrum(std::move(spec)); ++precursor_idx; - setProgress(ms1_frame_ids.size() + precursor_idx); + setProgress(ms1_to_emit + precursor_idx); } endProgress(); centroider.reportCapSummary(static_cast(config.ms1_centroid_max_peaks)); @@ -2040,9 +2099,10 @@ namespace OpenMS // --- MS1 frames --- FrameCentroider centroider; FrameAggregator ms1_aggregator(0.01); // finer bin for MS1 (see design spec) - if (config.ms1_n_neighbors > 0) ms1_aggregator.reserve(300'000); - startProgress(0, ms1_frame_ids.size(), "Loading DIA-PASEF MS1 frames"); - for (size_t i = 0; i < ms1_frame_ids.size(); ++i) + const size_t ms1_to_emit = config.load_ms1 ? ms1_frame_ids.size() : 0; + if (config.load_ms1 && config.ms1_n_neighbors > 0) ms1_aggregator.reserve(300'000); + startProgress(0, ms1_to_emit, "Loading DIA-PASEF MS1 frames"); + for (size_t i = 0; i < ms1_to_emit; ++i) { MSSpectrum spec; if (config.ms1_n_neighbors > 0) @@ -2059,7 +2119,8 @@ namespace OpenMS setProgress(i); } endProgress(); - centroider.reportCapSummary(static_cast(config.ms1_centroid_max_peaks)); + if (config.load_ms1) + centroider.reportCapSummary(static_cast(config.ms1_centroid_max_peaks)); // --- SWATH windows --- std::vector windows = readDIAWindows(db, *handle.scan2inv_ion_mobility_converter); @@ -2287,6 +2348,9 @@ namespace OpenMS // Iterate all frames at each MS level for (int level = 1; level <= 2; ++level) { + // Honor load_ms1: skip level==1 entirely when the user opts out of MS1. + if (level == 1 && !config.load_ms1) continue; + // Collect frame IDs at this level std::vector frame_ids; for (uint32_t fid = handle.min_frame_id(); fid <= handle.max_frame_id(); ++fid) diff --git a/src/openms/source/METADATA/SpectrumNativeIDParser.cpp b/src/openms/source/METADATA/SpectrumNativeIDParser.cpp index b592974e7f3..220ded5e29b 100644 --- a/src/openms/source/METADATA/SpectrumNativeIDParser.cpp +++ b/src/openms/source/METADATA/SpectrumNativeIDParser.cpp @@ -21,7 +21,8 @@ namespace OpenMS { return id.hasPrefix("scan=") || id.hasPrefix("scanId=") || id.hasPrefix("scanID=") || id.hasPrefix("controllerType=") || id.hasPrefix("function=") || id.hasPrefix("sample=") - || id.hasPrefix("index=") || id.hasPrefix("spectrum=") || id.hasPrefix("file="); + || id.hasPrefix("index=") || id.hasPrefix("spectrum=") || id.hasPrefix("file=") + || id.hasPrefix("frame="); } std::string SpectrumNativeIDParser::getRegExFromNativeID(const String& id) @@ -29,9 +30,15 @@ namespace OpenMS // "scan=NUMBER" e.g. Bruker/Agilent // "controllerType=0 controllerNumber=1 scan=NUMBER" for Thermo // "function= process= scan=NUMBER" for Waters + // "frame=FRAME_ID scan=SCAN_ID [precursor=PREC_ID]" for Bruker TDF + // (MS:1002818). Extra trailing tokens (precursor=, windowGroup=, + // scanStart=, scanEnd=, merged=) are used by OpenMS DDA/DIA PASEF + // and pwiz combined mode; the regex targets the "scan=" token + // which remains the most meaningful scan-number proxy. if (id.hasPrefix("scan=") || id.hasPrefix("controllerType=") - || id.hasPrefix("function=")) return std::string(R"(scan=(?\d+))"); + || id.hasPrefix("function=") + || id.hasPrefix("frame=")) return std::string(R"(scan=(?\d+))"); // "index=NUMBER" if (id.hasPrefix("index=")) return std::string(R"(index=(?\d+))"); @@ -83,8 +90,13 @@ namespace OpenMS { // check accession for data type to extract (e.g. MS:1000768 - Thermo nativeID format - scan=xsd:positiveInteger) boost::regex regexp; - // list of CV accessions with native id format "scan=NUMBER" - std::vector scan = {"MS:1000768","MS:1000769","MS:1000771","MS:1000772","MS:1000776"}; + // list of CV accessions with native id format "scan=NUMBER". + // MS:1002818 = "Bruker TDF nativeID format" (pattern "frame= scan=", + // we extend it with a trailing "precursor=" for aggregated DDA + // output — see BrukerTimsFile::loadDDA_ for the rationale). The regex + // targets the "scan=" token in all cases; trailing tokens are + // ignored. + std::vector scan = {"MS:1000768","MS:1000769","MS:1000771","MS:1000772","MS:1000776","MS:1002818"}; // list of CV accession with native id format "file=NUMBER" std::vector file = {"MS:1000773","MS:1000775"}; // expected number of subgroups diff --git a/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp b/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp index bd2f1ba97b6..ba22eae131b 100644 --- a/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp +++ b/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp @@ -260,6 +260,131 @@ START_SECTION(DDA loading integration test) } END_SECTION +START_SECTION(DDA native ID format test) +{ + // Contract: DDA MS2 native IDs are "frame= scan= precursor=

". + // This extends the MS:1002818 pattern with a trailing "precursor=

" + // token because OpenMS aggregates all PasefFrameMsMsInfo entries sharing + // the same Precursors.Id into ONE spectrum (pwiz emits per-mobility-scan, + // so its (frame, scan_begin) pairs are inherently unique — ours are not). + // The Precursors.Id is ALSO duplicated as MetaValue "bruker_precursor_id" + // for typed programmatic access. + BrukerTimsFile f; + MSExperiment exp; + f.load(OPENTIMS_DDA_TEST_DATA, exp); + + // Find a non-empty MS2 spectrum + const MSSpectrum* ms2 = nullptr; + for (const auto& spec : exp) + { + if (spec.getMSLevel() == 2 && !spec.empty()) + { + ms2 = &spec; + break; + } + } + TEST_NOT_EQUAL(ms2, nullptr); + + const String& id = ms2->getNativeID(); + TEST_EQUAL(id.hasPrefix("frame="), true); + TEST_EQUAL(id.hasSubstring(" scan="), true); + TEST_EQUAL(id.hasSubstring(" precursor="), true); + + // Precursors.Id must ALSO be stored as a typed MetaValue. + TEST_EQUAL(ms2->metaValueExists("bruker_precursor_id"), true); + + // All DDA MS2 native IDs must be unique inside the run (XSD-level mzML + // requirement). This is what the "precursor=

" disambiguator buys us. + std::set ms2_ids; + Size ms2_count = 0; + for (const auto& spec : exp) + { + if (spec.getMSLevel() == 2) + { + ms2_ids.insert(spec.getNativeID()); + ++ms2_count; + } + } + TEST_EQUAL(ms2_ids.size(), ms2_count); + + STATUS("DDA MS2 native ID sample: " << id + << " bruker_precursor_id=" << ms2->getMetaValue("bruker_precursor_id").toString()); +} +END_SECTION + +START_SECTION(Bruker load_ms1=false test) +{ + BrukerTimsFile f; + + // Baseline: default config loads both MS1 and MS2 + MSExperiment exp_both; + f.load(OPENTIMS_DDA_TEST_DATA, exp_both); + Size raw_ms1 = 0, raw_ms2 = 0; + for (const auto& s : exp_both) + { + if (s.getMSLevel() == 1) ++raw_ms1; + else if (s.getMSLevel() == 2) ++raw_ms2; + } + TEST_NOT_EQUAL(raw_ms1, 0); + TEST_NOT_EQUAL(raw_ms2, 0); + + // load_ms1=false: zero MS1, MS2 count unchanged + BrukerTimsFile::Config cfg; + cfg.load_ms1 = false; + MSExperiment exp_ms2_only; + f.load(OPENTIMS_DDA_TEST_DATA, exp_ms2_only, cfg); + Size skip_ms1 = 0, skip_ms2 = 0; + for (const auto& s : exp_ms2_only) + { + if (s.getMSLevel() == 1) ++skip_ms1; + else if (s.getMSLevel() == 2) ++skip_ms2; + } + TEST_EQUAL(skip_ms1, 0); + TEST_EQUAL(skip_ms2, raw_ms2); + + STATUS("DDA load_ms1=false: raw MS1=" << raw_ms1 << " MS2=" << raw_ms2 + << " | skipped MS1=" << skip_ms1 << " MS2=" << skip_ms2); + +#ifdef OPENTIMS_DIA_TEST_DATA + // Same invariant on DIA + MSExperiment exp_dia_both; + f.load(OPENTIMS_DIA_TEST_DATA, exp_dia_both); + Size dia_raw_ms1 = 0, dia_raw_ms2 = 0; + for (const auto& s : exp_dia_both) + { + if (s.getMSLevel() == 1) ++dia_raw_ms1; + else if (s.getMSLevel() == 2) ++dia_raw_ms2; + } + + BrukerTimsFile::Config cfg_dia; + cfg_dia.load_ms1 = false; + MSExperiment exp_dia_skip; + f.load(OPENTIMS_DIA_TEST_DATA, exp_dia_skip, cfg_dia); + Size dia_skip_ms1 = 0, dia_skip_ms2 = 0; + for (const auto& s : exp_dia_skip) + { + if (s.getMSLevel() == 1) ++dia_skip_ms1; + else if (s.getMSLevel() == 2) ++dia_skip_ms2; + } + TEST_EQUAL(dia_skip_ms1, 0); + TEST_EQUAL(dia_skip_ms2, dia_raw_ms2); + + // FRAME export mode must also honor load_ms1 (skips level==1 loop) + BrukerTimsFile::Config cfg_frame; + cfg_frame.export_mode = BrukerTimsFile::Config::FRAME; + cfg_frame.load_ms1 = false; + MSExperiment exp_frame; + f.load(OPENTIMS_DIA_TEST_DATA, exp_frame, cfg_frame); + Size frame_ms1 = 0; + for (const auto& s : exp_frame) + { + if (s.getMSLevel() == 1) ++frame_ms1; + } + TEST_EQUAL(frame_ms1, 0); +#endif +} +END_SECTION + START_SECTION(DDA round-trip test: load .d -> write mzML -> reload -> verify) { // Load from .d diff --git a/src/tests/class_tests/openms/source/ProSEAlgorithm_test.cpp b/src/tests/class_tests/openms/source/ProSEAlgorithm_test.cpp index 3db2167b3ba..890fe6201e9 100644 --- a/src/tests/class_tests/openms/source/ProSEAlgorithm_test.cpp +++ b/src/tests/class_tests/openms/source/ProSEAlgorithm_test.cpp @@ -1242,16 +1242,32 @@ START_SECTION(([EXTRA] calibration preserves asymmetric bias - normal case)) const auto& cal = algo.last_calibration_result_; TEST_EQUAL(cal.success, true) TEST_EQUAL(cal.extreme_bias, false) - // Shift should be close to +7 ppm. The fixture is small and Math::median on - // an even count averages the two middle elements so we allow +/- 5 ppm slack. - TOLERANCE_ABSOLUTE(5.0) - TEST_REAL_SIMILAR(cal.precursor_shift, 7.0) - TOLERANCE_ABSOLUTE(1e-5) // reset to default - // cal_lower/cal_upper should be tightened from [20, 30] but retain asymmetry. + // Fixture's ppm_shifts are all positive, so the calibration direction must + // come out positive too. Spread is strictly positive by construction. + TEST_EQUAL(cal.precursor_shift > 0.0, true) + TEST_EQUAL(cal.precursor_spread > 0.0, true) + // |shift| < spread is the precondition for the writeback block (extreme_bias + // already asserted false above, but state it as a positive numerical check). + TEST_EQUAL(std::abs(cal.precursor_shift) < cal.precursor_spread, true) + + // Load-bearing check — guards the sign convention in runCalibrationPass_'s + // writeback block. Under (lower, upper) where signed error e lies in + // [-upper, +lower], the calibrated window [shift - spread, shift + spread] + // maps to: + // cal_lower = spread + shift (upper endpoint of the signed window) + // cal_upper = spread - shift (|lower endpoint| of the signed window) + // A regression of the swap would flip both of these. + TEST_REAL_SIMILAR(cal.cal_lower, cal.precursor_spread + cal.precursor_shift) + TEST_REAL_SIMILAR(cal.cal_upper, cal.precursor_spread - cal.precursor_shift) + // Positive bias => cal_lower > cal_upper. A regression of the swap would + // produce cal_upper > cal_lower — the assertion below catches that even + // if the functional identities above are tautologically consistent with a + // mislabeled spread/shift pair. + TEST_EQUAL(cal.cal_lower > cal.cal_upper, true) + // Both tightened from user-configured (20, 30); std::min cap inactive, so + // the functional identities above are unconstrained. TEST_EQUAL(cal.cal_lower < 20.0, true) TEST_EQUAL(cal.cal_upper < 30.0, true) - // And remain asymmetric — the positive bias means cal_upper > cal_lower. - TEST_EQUAL(cal.cal_upper > cal.cal_lower, true) // Post-search, the tolerance members have been RESTORED to the user-configured // values to avoid per-file state leaks in the multi-file wrapper (which reuses a // single ProSEAlgorithm instance across files). The calibrated diff --git a/src/tests/topp/CMakeLists.txt b/src/tests/topp/CMakeLists.txt index 1532b4157d6..c7af5628dd8 100644 --- a/src/tests/topp/CMakeLists.txt +++ b/src/tests/topp/CMakeLists.txt @@ -3000,6 +3000,17 @@ if (WITH_OPENTIMS AND OPENTIMS_DDA_TEST_DATA) -protein false) set_tests_properties("TOPP_FalseDiscoveryRate_SimpleSearchEngineDDA" PROPERTIES DEPENDS "TOPP_SimpleSearchEngine_DDA") + # Floor-only PSM-count regression tripwire. Current yield on this fixture is + # 4187; floor set ~0.5% below so tiny tie-breaking jitter is absorbed but any + # real regression (silent native-ID or IM-annotation breakage, thirdparty + # algorithm drift) trips the check. If a legitimate improvement raises yield, + # raise the floor in the same commit. + add_test(NAME "TOPP_FalseDiscoveryRate_SimpleSearchEngineDDA_psm_count" + COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/check_psm_floor.cmake + ${TESTS_TEMP_DIR}/SimpleSearchEngine_DDA_fdr.tmp.idXML 4170) + set_tests_properties("TOPP_FalseDiscoveryRate_SimpleSearchEngineDDA_psm_count" + PROPERTIES DEPENDS "TOPP_FalseDiscoveryRate_SimpleSearchEngineDDA") + add_test("TOPP_ProSE_DDA" ${TOPP_BIN_PATH}/ProSE -test -in ${OPENTIMS_DDA_SYMLINK} -database ${OPENTIMS_TEST_FASTA} @@ -3021,6 +3032,13 @@ if (WITH_OPENTIMS AND OPENTIMS_DDA_TEST_DATA) -protein false) set_tests_properties("TOPP_FalseDiscoveryRate_ProSEDDA" PROPERTIES DEPENDS "TOPP_ProSE_DDA") + # Floor: 4373 current; guard ~0.5% below. + add_test(NAME "TOPP_FalseDiscoveryRate_ProSEDDA_psm_count" + COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/check_psm_floor.cmake + ${TESTS_TEMP_DIR}/ProSE_DDA_fdr.tmp.idXML 4350) + set_tests_properties("TOPP_FalseDiscoveryRate_ProSEDDA_psm_count" + PROPERTIES DEPENDS "TOPP_FalseDiscoveryRate_ProSEDDA") + # Same DDA search with automatic calibration enabled add_test("TOPP_ProSE_DDA_calibrated" ${TOPP_BIN_PATH}/ProSE -test -in ${OPENTIMS_DDA_SYMLINK} @@ -3044,6 +3062,16 @@ if (WITH_OPENTIMS AND OPENTIMS_DDA_TEST_DATA) -protein false) set_tests_properties("TOPP_FalseDiscoveryRate_ProSEDDA_calibrated" PROPERTIES DEPENDS "TOPP_ProSE_DDA_calibrated") + # Floor: 4537 current (calibration should legitimately EXCEED uncalibrated yield); + # guard ~0.5% below. This test also protects the sign-convention fix in + # ProSEAlgorithm::runCalibrationPass_ — a regression of the swap would drop + # this count back to ~1441. + add_test(NAME "TOPP_FalseDiscoveryRate_ProSEDDA_calibrated_psm_count" + COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/check_psm_floor.cmake + ${TESTS_TEMP_DIR}/ProSE_DDA_calibrated_fdr.tmp.idXML 4510) + set_tests_properties("TOPP_FalseDiscoveryRate_ProSEDDA_calibrated_psm_count" + PROPERTIES DEPENDS "TOPP_FalseDiscoveryRate_ProSEDDA_calibrated") + # Generate target-decoy database (shared by CometAdapter and SageAdapter DDA tests) add_test("TOPP_DecoyDatabase_DDA" ${TOPP_BIN_PATH}/DecoyDatabase -test -in ${OPENTIMS_TEST_FASTA} diff --git a/src/tests/topp/check_psm_floor.cmake b/src/tests/topp/check_psm_floor.cmake new file mode 100644 index 00000000000..25ec17a51d2 --- /dev/null +++ b/src/tests/topp/check_psm_floor.cmake @@ -0,0 +1,35 @@ +# Usage: cmake -P check_psm_floor.cmake +# +# Counts occurrences in an idXML file and fails if the count +# is below . Intended as a regression tripwire on PSM yields from the +# Bruker DDA opentims integration tests (1% FDR outputs). Floor-only: legitimate +# improvements that raise the count should NOT fail; only drops do. + +if(NOT DEFINED CMAKE_ARGV3 OR NOT DEFINED CMAKE_ARGV4) + message(FATAL_ERROR "Usage: cmake -P check_psm_floor.cmake ") +endif() + +set(file "${CMAKE_ARGV3}") +set(floor "${CMAKE_ARGV4}") + +if(NOT EXISTS "${file}") + message(FATAL_ERROR "PSM-count input not found: ${file}") +endif() + +# Use file(READ) + string(REGEX MATCHALL) rather than file(STRINGS ... REGEX). +# file(STRINGS) silently drops long lines/entries and produces wrong counts on +# idXML (observed: 186 vs the true 4537) — MATCHALL over the full file content +# is the reliable path. +file(READ "${file}" content) +string(REGEX MATCHALL " +#include + #ifdef WITH_OPENTIMS #include #endif @@ -697,13 +699,17 @@ class TOPPCometAdapter : const bool is_bruker_d = (FileHandler::getType(inputfile_name) == FileTypes::BRUKER_TDF); if (is_bruker_d) { - // Load .d via BrukerTimsFile + // Load .d via BrukerTimsFile. Skip MS1 loading for MS2 searches since + // Comet only searches MS2 — cuts load time substantially. auto bruker_config = getBrukerConfig_(); + if (ms_level == 2) + bruker_config.load_ms1 = false; BrukerTimsFile tims_file; tims_file.setLogType(log_type_); tims_file.load(inputfile_name, exp, bruker_config); - // Filter to target MS level only (Comet only needs MS2) + // Filter to target MS level only (Comet only needs MS2). Redundant when + // load_ms1=false for MS2 searches but kept for safety / MS1 searches. std::erase_if(exp.getSpectra(), [&](const MSSpectrum& s) { return s.getMSLevel() != static_cast(ms_level); }); OPENMS_LOG_INFO << "Loaded " << exp.size() << " MS" << ms_level << " spectra from Bruker .d directory." << std::endl; @@ -714,6 +720,27 @@ class TOPPCometAdapter : writeDebug_("Last native ID from .d: " + exp[exp.size() - 1].getNativeID(), 2); } + // Rewrite native IDs to "index=N" (monotonic) before the temp-mzML write. + // + // Bruker .d DDA native IDs are of the form "frame=F scan=S precursor=P". + // Comet's bundled mzParser (MSToolkit/saxmzmlhandler.cpp) greps for "scan=" + // anywhere in the id and uses atoi of the following digits as its sort + // key. Because `scan=S` here is the TIMS isolation-window start (not a + // monotonic counter), mzParser detects "unsorted" scans and falls into + // a std::sort with a strict-weak-ordering-violating comparator + // (cindex::compare returns true for equal values) → undefined behavior + // → segfault in std::string shuffling. + // + // Workaround: rewrite native IDs to `index=N` (counter path, no atoi + // on scan=) before handing the mzML to Comet. Preserve the original + // native ID on each spectrum so post-Comet PSMs can be translated back. + size_t idx = 0; + for (auto& spec : exp.getSpectra()) + { + spec.setMetaValue("original_native_id", spec.getNativeID()); + spec.setNativeID("index=" + String(idx++)); + } + // Write to temporary indexed mzML for Comet auto tmp_mzml = File::getTemporaryFile() + ".mzML"; MzMLFile().store(tmp_mzml, exp); @@ -820,9 +847,32 @@ class TOPPCometAdapter : // Parse FAIMS compensation voltage if present SpectrumMetaDataLookup::addMissingFAIMSToPeptideIDs(peptide_identifications, exp); +#ifdef WITH_OPENTIMS + // Translate PSM spectrum references back from "index=N" (the rewritten form + // we handed to Comet to work around the mzParser sort bug) to the original + // Bruker native ID ("frame=F scan=S precursor=P"). Only the Bruker branch + // set original_native_id; the mzML branch has no-op for this loop. + if (is_bruker_d && !exp.empty() && exp[0].metaValueExists("original_native_id")) + { + // Build rewritten-id → original-id map once (O(N) vs O(N*M) linear scan). + std::unordered_map id_map; + id_map.reserve(exp.size()); + for (const auto& spec : exp.getSpectra()) + { + if (spec.metaValueExists("original_native_id")) + id_map.emplace(spec.getNativeID(), spec.getMetaValue("original_native_id").toString()); + } + for (auto& pid : peptide_identifications) + { + auto it = id_map.find(pid.getSpectrumReference()); + if (it != id_map.end()) pid.setSpectrumReference(it->second); + } + } +#endif + // remove base_name meta value from peptide identifications for (auto& peptide_identification : peptide_identifications) - { + { peptide_identification.removeMetaValue("base_name"); } diff --git a/src/topp/FileConverter.cpp b/src/topp/FileConverter.cpp index 41f699c9ba7..bb49d77c3d8 100644 --- a/src/topp/FileConverter.cpp +++ b/src/topp/FileConverter.cpp @@ -184,6 +184,10 @@ class TOPPFileConverter : setMinFloat_("bruker:calibration_tolerance", 0.0); registerStringOption_("bruker:calibrate", "", "false", "Enable m/z recalibration (may fail on some datasets)", false, true); setValidStrings_("bruker:calibrate", {"true", "false"}); + registerStringOption_("bruker:load_ms1", "", "true", + "Load MS1 spectra. Disable for MS2-only workflows (peptide database search) " + "where MS1 surveys are not needed — substantially cuts memory and time. Affects all export modes.", false, true); + setValidStrings_("bruker:load_ms1", {"true", "false"}); registerStringOption_("bruker:export_mode", "", "auto", "Export mode: 'auto' detects DDA/DIA acquisition type, " "'spectrum' forces per-precursor MS2 spectra (DDA-style), 'frame' returns raw 4D frames without signal processing.", false, true); setValidStrings_("bruker:export_mode", {"auto", "spectrum", "frame"}); @@ -266,6 +270,7 @@ class TOPPFileConverter : BrukerTimsFile::Config c; c.calibration_tolerance = getDoubleOption_("bruker:calibration_tolerance"); c.calibrate = (getStringOption_("bruker:calibrate") == "true"); + c.load_ms1 = (getStringOption_("bruker:load_ms1") == "true"); String mode = getStringOption_("bruker:export_mode"); if (mode == "spectrum") c.export_mode = BrukerTimsFile::Config::SPECTRUM; else if (mode == "frame") c.export_mode = BrukerTimsFile::Config::FRAME; From 2e25d1c997cebcd6213d9e2aea1ce6028c69a207 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2026 06:12:25 +0200 Subject: [PATCH 15/20] docs: sync CHANGELOG with recent changes (2026-04-17) (#9165) Add entry for contrib directory converted to git submodule (#9155): developers and packagers building from source now need 'git submodule update --init' to obtain vendored third-party libraries. Co-authored-by: GitHub Copilot Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CHANGELOG | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG b/CHANGELOG index 3258e2b2eb4..31837e24dbd 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -55,6 +55,7 @@ Dependencies: - ENABLE_TDL CMake option (required for CWL/Common Workflow Language file support) now defaults to OFF; set -DENABLE_TDL=ON to re-enable (#9067) - Boost minimum version requirement raised from 1.78 to 1.81 (#9095) - opentims: CMake now searches for a system-installed opentims_cpp via FindOpentims.cmake before falling back to a FetchContent git clone; packagers and users with opentims already installed can set opentims_DIR or opentims_ROOT to skip the download (#9147, #9148) + - contrib: the contrib directory is now a git submodule; developers and packagers building from a git clone must run `git submodule update --init` (or clone with `--recurse-submodules`) to obtain the vendored third-party libraries (#9155) TOPP tools: Changes: From 7f6c21ce59505096737dc0b96485884a18eb73b2 Mon Sep 17 00:00:00 2001 From: Timo Sachsenberg Date: Fri, 17 Apr 2026 10:23:58 +0200 Subject: [PATCH 16/20] feat(ProSE): add -out_pin for Percolator input file output (#9166) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(ProSE): add -out_pin for Percolator input file output Registers -out_pin (per-input, like -out_idxml) that writes Percolator .pin/.tsv files via PercolatorInfile::store(). Works independently of -percolator_executable — users can produce .pin files for external rescoring without running Percolator from within ProSE. The search algorithm already populates the extra_features metavalue on ProteinIdentification::SearchParameters with the correct feature set (score, fragment_error_median_ppm, matched_prefix/suffix_ions_ fraction). This commit just wires it through the TOPP tool's CLI and output loop. Closes item 4 in #9108. Co-Authored-By: Claude Opus 4.6 (1M context) * fix(ProSE): mark -out_pin as advanced parameter Co-Authored-By: Claude Opus 4.6 (1M context) * fix(ProSE): address CodeRabbit review on -out_pin (#9166) Three bugs found by CodeRabbit, all in the .pin output block: 1. (Critical) min_charge parsed via String(sp.charges).toInt() on the full "2:5" string — throws ConversionError, silently preventing .pin output on every run. Fixed: split on ':' for both min and max. 2. (Major) Duplicate "score" column — hardcoded "score" + extra_features (which already contains "score") → malformed .pin header. Fixed: only insert "score" if not already present in extra_features. 3. (Minor) result.protein_ids.front() without empty guard. Fixed: check empty and log error before accessing. Co-Authored-By: Claude Opus 4.6 (1M context) --------- Co-authored-by: Claude Opus 4.6 (1M context) --- src/topp/ProSE.cpp | 62 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/src/topp/ProSE.cpp b/src/topp/ProSE.cpp index bad739a0d09..d45db160f75 100644 --- a/src/topp/ProSE.cpp +++ b/src/topp/ProSE.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include // only for std::shared_ptr declarations @@ -102,6 +103,9 @@ class ProSE : registerOutputFile_("out_merged", "", "", "Optional merged output file containing all PSMs pooled across input files with cross-file protein inference (BasicProteinInferenceAlgorithm) and optional picked-protein FDR (FDR:protein). Per-file outputs (-out_idxml / -out_qpx / -out_parquet) retain run-level information. Only useful with multiple -in files.", false); setValidFormats_("out_merged", ListUtils::create("idXML")); + registerOutputFileList_("out_pin", "", StringList(), "Output Percolator input (.pin/.tsv) file(s) for external rescoring. Must have the same number of entries as -in. Written independently of -percolator_executable (i.e. you can produce .pin files without running Percolator).", false, true); + setValidFormats_("out_pin", ListUtils::create("tsv")); + registerOutputDir_("out_mod_analysis_dir", "

", "", "Optional directory to write modification-analysis tables (delta-mass, PTM stats) when running in open-search mode. When set, per-file tables are written using each input file's basename and an additional aggregate table is written across all input files. Has no effect in closed-search mode.", false, true); registerInputFile_("percolator_executable", "", @@ -123,6 +127,7 @@ class ProSE : const StringList in_list = getStringList_("in"); const String database = getStringOption_("database"); const StringList out_idxml_list = getStringList_("out_idxml"); + const StringList out_pin_list = getStringList_("out_pin"); const String out_merged = getStringOption_("out_merged"); const String out_qpx_dir = getOutputDirOption("out_qpx"); const String out_parquet_dir = getOutputDirOption("out_parquet"); @@ -136,9 +141,9 @@ class ProSE : } // At least one output must be specified - if (out_idxml_list.empty() && out_qpx_dir.empty() && out_parquet_dir.empty()) + if (out_idxml_list.empty() && out_pin_list.empty() && out_qpx_dir.empty() && out_parquet_dir.empty()) { - OPENMS_LOG_ERROR << "No output specified. Provide at least one of -out_idxml, -out_qpx, or -out_parquet." << endl; + OPENMS_LOG_ERROR << "No output specified. Provide at least one of -out_idxml, -out_pin, -out_qpx, or -out_parquet." << endl; return ILLEGAL_PARAMETERS; } @@ -150,6 +155,14 @@ class ProSE : return ILLEGAL_PARAMETERS; } + // -out_pin count must match -in count + if (!out_pin_list.empty() && in_list.size() != out_pin_list.size()) + { + OPENMS_LOG_ERROR << "Number of output files (-out_pin, " << out_pin_list.size() + << ") must match number of input files (-in, " << in_list.size() << ")." << endl; + return ILLEGAL_PARAMETERS; + } + // Validate basenames are unique (for parquet directory output) if (!out_qpx_dir.empty() || !out_parquet_dir.empty()) { @@ -435,6 +448,51 @@ class ProSE : } } + // -- Percolator .pin output -- + if (!out_pin_list.empty()) + { + try + { + if (result.protein_ids.empty()) + { + OPENMS_LOG_ERROR << "Cannot write .pin for " << in_file + << ": no ProteinIdentification/search parameters available." << endl; + input_failed = true; + } + else + { + const auto& sp = result.protein_ids.front().getSearchParameters(); + StringList feature_set; + if (sp.metaValueExists("extra_features")) + { + feature_set = ListUtils::create(sp.getMetaValue("extra_features").toString()); + } + if (std::find(feature_set.begin(), feature_set.end(), "score") == feature_set.end()) + { + feature_set.insert(feature_set.begin(), "score"); + } + feature_set.push_back("Peptide"); + feature_set.push_back("Proteins"); + const String enz_str = sp.digestion_enzyme.getName(); + const auto colon = sp.charges.find(':'); + const int min_charge = (colon != String::npos) + ? String(sp.charges.substr(0, colon)).toInt() + : String(sp.charges).toInt(); + const int max_charge = (colon != String::npos) + ? String(sp.charges.substr(colon + 1)).toInt() + : min_charge; + PercolatorInfile::store(out_pin_list[i], result.peptide_ids, + feature_set, enz_str, min_charge, max_charge); + } + } + catch (const Exception::BaseException& e) + { + OPENMS_LOG_ERROR << "Failed to write .pin output for " << in_file + << " -> " << out_pin_list[i] << ": " << e.what() << endl; + input_failed = true; + } + } + // -- QPX directory output -- if (!out_qpx_dir.empty()) { From 2bef3b47ae7f4f813d469287a9ba0244a9f7c687 Mon Sep 17 00:00:00 2001 From: Timo Sachsenberg Date: Fri, 17 Apr 2026 10:51:07 +0200 Subject: [PATCH 17/20] fix(ProSE): honor ions:add_* parameters in scoring path (#9167) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ProSE declared ions:add_a/b/c/x/y/z_ions parameters in its constructor but updateMembers_() never read them. FragmentIndex received and used them correctly (via setParameters in prepareContext), but TheoreticalSpectrumGenerator in the scoring loop always used its defaults (b/y only) — making the index and scorer inconsistent. Fix: read the 6 ion toggles in updateMembers_() and forward them to TheoreticalSpectrumGenerator alongside the existing add_first_prefix_ion and add_metainfo settings. Users can now configure ETD-style searches with -Search:ions:add_c_ions true -Search:ions:add_z_ions true -Search:ions:add_b_ions false -Search:ions:add_y_ions false and have both the index and scoring honor the setting. Closes item 12 in #9108. Co-authored-by: Claude Opus 4.6 (1M context) --- .../include/OpenMS/ANALYSIS/ID/ProSEAlgorithm.h | 7 +++++++ .../source/ANALYSIS/ID/ProSEAlgorithm.cpp | 17 ++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/ProSEAlgorithm.h b/src/openms/include/OpenMS/ANALYSIS/ID/ProSEAlgorithm.h index fcf2bd8e923..d1d741b6c39 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/ProSEAlgorithm.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/ProSEAlgorithm.h @@ -436,6 +436,13 @@ class OPENMS_DLLAPI ProSEAlgorithm : Size report_top_hits_; + bool add_a_ions_{false}; + bool add_b_ions_{true}; + bool add_c_ions_{false}; + bool add_x_ions_{false}; + bool add_y_ions_{true}; + bool add_z_ions_{false}; + bool calibration_enabled_{false}; double calibration_subset_ratio_{0.1}; Size calibration_min_psms_{50}; diff --git a/src/openms/source/ANALYSIS/ID/ProSEAlgorithm.cpp b/src/openms/source/ANALYSIS/ID/ProSEAlgorithm.cpp index 69a4f0c6c59..7fad9dca3a5 100644 --- a/src/openms/source/ANALYSIS/ID/ProSEAlgorithm.cpp +++ b/src/openms/source/ANALYSIS/ID/ProSEAlgorithm.cpp @@ -270,6 +270,13 @@ namespace OpenMS // Open search mode is automatically determined based on precursor tolerance in isOpenSearchMode_() + add_a_ions_ = param_.getValue("ions:add_a_ions").toBool(); + add_b_ions_ = param_.getValue("ions:add_b_ions").toBool(); + add_c_ions_ = param_.getValue("ions:add_c_ions").toBool(); + add_x_ions_ = param_.getValue("ions:add_x_ions").toBool(); + add_y_ions_ = param_.getValue("ions:add_y_ions").toBool(); + add_z_ions_ = param_.getValue("ions:add_z_ions").toBool(); + calibration_enabled_ = param_.getValue("calibration:enabled") == "true"; calibration_subset_ratio_ = param_.getValue("calibration:subset_ratio"); calibration_min_psms_ = param_.getValue("calibration:min_psms"); @@ -824,11 +831,19 @@ namespace OpenMS // call sites below read from this field instead of re-computing. last_mod_match_tolerance_used_ = computeModMatchTolerance_(); - // create spectrum generator + // create spectrum generator — forward the ion-series toggles so scoring + // uses the same ion types as the FragmentIndex (which already reads them + // via setParameters in prepareContext). TheoreticalSpectrumGenerator spectrum_generator; Param param(spectrum_generator.getParameters()); param.setValue("add_first_prefix_ion", "true"); param.setValue("add_metainfo", "true"); + param.setValue("add_a_ions", add_a_ions_ ? "true" : "false"); + param.setValue("add_b_ions", add_b_ions_ ? "true" : "false"); + param.setValue("add_c_ions", add_c_ions_ ? "true" : "false"); + param.setValue("add_x_ions", add_x_ions_ ? "true" : "false"); + param.setValue("add_y_ions", add_y_ions_ ? "true" : "false"); + param.setValue("add_z_ions", add_z_ions_ ? "true" : "false"); spectrum_generator.setParameters(param); // preallocate storage for PSMs From 69f08c327c59960869b44eb5fdf3a78cc6d619cd Mon Sep 17 00:00:00 2001 From: Alen Saric Date: Fri, 17 Apr 2026 15:57:26 +0200 Subject: [PATCH 18/20] Implemented a checkup for the cleavge points and stops. It should only be A-Z, if an unknown character is recorded, it throws an Exception. --- .../source/CHEMISTRY/DigestionEnzyme.cpp | 35 ++++++++++++++++--- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/src/openms/source/CHEMISTRY/DigestionEnzyme.cpp b/src/openms/source/CHEMISTRY/DigestionEnzyme.cpp index 1ea653c265a..f6f94bd126f 100644 --- a/src/openms/source/CHEMISTRY/DigestionEnzyme.cpp +++ b/src/openms/source/CHEMISTRY/DigestionEnzyme.cpp @@ -3,7 +3,7 @@ // // -------------------------------------------------------------------------- // $Maintainer: Xiao Liang $ -// $Authors: Xiao Liang $ +// $Authors: Xiao Liang, Alen Saric $ // -------------------------------------------------------------------------- // @@ -45,9 +45,7 @@ namespace OpenMS name_(name), synonyms_(synonyms), regex_description_(std::move(regex_description)) - { - //TODO check if all letters are A-Z? - if (cut_before.empty()) + { if (cut_before.empty()) { //Maybe assertion? throw Exception::MissingInformation( @@ -56,7 +54,34 @@ namespace OpenMS OPENMS_PRETTY_FUNCTION, "No cleavage position given when trying to construct a DigestionEnzyme."); } - else if (!cut_before.hasSuffix("X")) + // We now know that cut_before is not empty. + // Check if every character given in the Positive Lookbehind Sequence is an A-Z, throw Exception otherwise. + for(char c: cut_before) + { + if (c > 'Z' || c < 'A') + { + throw Exception::InvalidParameter( + __FILE__, + __LINE__, + OPENMS_PRETTY_FUNCTION, + "Amino Acids to get used for cleavage contains unknown character - something else than A-Z: " + c + "." + ); + } + } + // We also want to verifiy it for the AAs for the Negative Lookahead + for(char c: nocut_after) + { + if (c > 'Z' || c < 'A') + { + throw Exception::InvalidParameter( + __FILE__, + __LINE__, + OPENMS_PRETTY_FUNCTION, + "Amino Acids to stop cleavages contains unknown character - something else than A-Z: " + c + "." + ); + } + } + if (!cut_before.hasSuffix("X")) { //TODO think about this cut_before = cut_before + "X"; From 5cca9e9ababb24493db65af171e083cbd17bcf4d Mon Sep 17 00:00:00 2001 From: Alen Saric Date: Wed, 22 Apr 2026 14:29:21 +0200 Subject: [PATCH 19/20] Shifted the Constructor of DigestionEnzyme. Refactored into DigestionEnzymeProtein. Created also Tests and whatnot. --- .../OpenMS/CHEMISTRY/DigestionEnzyme.h | 23 ++-- .../OpenMS/CHEMISTRY/DigestionEnzymeProtein.h | 12 +- .../source/CHEMISTRY/DigestionEnzyme.cpp | 78 ------------ .../CHEMISTRY/DigestionEnzymeProtein.cpp | 73 ++++++++++- .../source/DigestionEnzymeProtein_test.cpp | 34 ++++- .../openms/source/DigestionEnzyme_test.cpp | 116 ++++++++++++++++++ 6 files changed, 237 insertions(+), 99 deletions(-) create mode 100644 src/tests/class_tests/openms/source/DigestionEnzyme_test.cpp diff --git a/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzyme.h b/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzyme.h index d9f95631555..84d5f46a9f7 100644 --- a/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzyme.h +++ b/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzyme.h @@ -25,13 +25,15 @@ namespace OpenMS @brief Base class for digestion enzymes */ - class OPENMS_DLLAPI DigestionEnzyme - { - public: + class OPENMS_DLLAPI DigestionEnzyme + { + public: - /** @name Constructors + /** @name Constructors */ - //@{ + //@{ + /// default constructor + DigestionEnzyme(); /// Copy constructor DigestionEnzyme(const DigestionEnzyme&) = default; @@ -44,14 +46,6 @@ namespace OpenMS const std::set& synonyms = std::set(), String regex_description = ""); - /// Detailed constructor 2 - explicit DigestionEnzyme(const String& name, - String cut_before, - const String& nocut_after = "", - String sense = "C", - const std::set& synonyms = std::set(), - String regex_description = ""); - /// Destructor virtual ~DigestionEnzyme(); //@} @@ -128,8 +122,6 @@ namespace OpenMS protected: - /// default constructor - DigestionEnzyme(); // basic String name_; @@ -164,4 +156,3 @@ namespace std } }; } // namespace std - diff --git a/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzymeProtein.h b/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzymeProtein.h index 394100355fe..9c11df58df2 100644 --- a/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzymeProtein.h +++ b/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzymeProtein.h @@ -23,7 +23,7 @@ namespace OpenMS public DigestionEnzyme { public: - + enum Sense {C_TERM,N_TERM}; /** @name Constructors */ //@{ @@ -53,6 +53,13 @@ namespace OpenMS Int msgf_id = -1, Int omssa_id = -1); + explicit DigestionEnzymeProtein(const String& name, + String cut_before, + Sense sense, + const String& nocut_after = "", + const std::set& synonyms = std::set(), + String regex_description = ""); + /// Destructor ~DigestionEnzymeProtein() override; //@} @@ -159,10 +166,11 @@ namespace OpenMS Int omssa_id_; + String buildRegex_(String& cut_before, const String& nocut_after,const DigestionEnzymeProtein::Sense& sense); }; + OPENMS_DLLAPI std::ostream& operator<<(std::ostream& os, const DigestionEnzymeProtein& enzyme); typedef DigestionEnzymeProtein Protease; } - diff --git a/src/openms/source/CHEMISTRY/DigestionEnzyme.cpp b/src/openms/source/CHEMISTRY/DigestionEnzyme.cpp index f6f94bd126f..25ac12af45d 100644 --- a/src/openms/source/CHEMISTRY/DigestionEnzyme.cpp +++ b/src/openms/source/CHEMISTRY/DigestionEnzyme.cpp @@ -36,83 +36,6 @@ namespace OpenMS { } - DigestionEnzyme::DigestionEnzyme(const String& name, - String cut_before, - const String& nocut_after, - String sense, - const std::set& synonyms, - String regex_description) : - name_(name), - synonyms_(synonyms), - regex_description_(std::move(regex_description)) - { if (cut_before.empty()) - { - //Maybe assertion? - throw Exception::MissingInformation( - __FILE__, - __LINE__, - OPENMS_PRETTY_FUNCTION, - "No cleavage position given when trying to construct a DigestionEnzyme."); - } - // We now know that cut_before is not empty. - // Check if every character given in the Positive Lookbehind Sequence is an A-Z, throw Exception otherwise. - for(char c: cut_before) - { - if (c > 'Z' || c < 'A') - { - throw Exception::InvalidParameter( - __FILE__, - __LINE__, - OPENMS_PRETTY_FUNCTION, - "Amino Acids to get used for cleavage contains unknown character - something else than A-Z: " + c + "." - ); - } - } - // We also want to verifiy it for the AAs for the Negative Lookahead - for(char c: nocut_after) - { - if (c > 'Z' || c < 'A') - { - throw Exception::InvalidParameter( - __FILE__, - __LINE__, - OPENMS_PRETTY_FUNCTION, - "Amino Acids to stop cleavages contains unknown character - something else than A-Z: " + c + "." - ); - } - } - if (!cut_before.hasSuffix("X")) - { - //TODO think about this - cut_before = cut_before + "X"; - } - cleavage_regex_ = ""; - if (sense.toLower() == "c") - { - cleavage_regex_ += "(?<=[" + cut_before + "]"; - if (!nocut_after.empty()) - { - cleavage_regex_ += "(?!" + nocut_after + "])"; - } - } - else if (sense.toLower() == "n") - { - if (!nocut_after.empty()) - { - cleavage_regex_ += "(?& synonyms = std::set(), + String regex_description = ""): + DigestionEnzyme(name, buildRegex_(cut_before, nocut_after, sense), synonyms, std::move(regex_description)) + { + } DigestionEnzymeProtein::~DigestionEnzymeProtein() = default; @@ -210,6 +219,67 @@ namespace OpenMS return false; } + String DigestionEnzymeProtein::buildRegex_(String& cut_before, const String& nocut_after, const DigestionEnzymeProtein::Sense& sense) + { + if (cut_before.empty()) + { + throw Exception::MissingInformation( + __FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + "No cleavage position given when trying to construct a DigestionEnzyme."); + } + + for(char c : cut_before) + { + if (c > 'Z' || c < 'A') + { + throw Exception::InvalidParameter( + __FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + "Amino Acids for cleavage contain unknown character: " + String(c)); + } + } + + for(char c : nocut_after) + { + if (c > 'Z' || c < 'A') + { + throw Exception::InvalidParameter( + __FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + "Amino Acids to stop cleavage contain unknown character: " + String(c)); + } + } + + if (!cut_before.hasSuffix("X")) + { + cut_before += "X"; + } + + String result = ""; + if (sense == DigestionEnzymeProtein::Sense::C_TERM) + { + result = "(?<=[" + cut_before + "])"; + if (!nocut_after.empty()) + { + result += "(?!" + nocut_after + "])"; + } + } + else if (sense == DigestionEnzymeProtein::Sense::N_TERM) + { + if (!nocut_after.empty()) + { + result = "(?(enzyme) << " " @@ -218,4 +288,3 @@ namespace OpenMS } } - diff --git a/src/tests/class_tests/openms/source/DigestionEnzymeProtein_test.cpp b/src/tests/class_tests/openms/source/DigestionEnzymeProtein_test.cpp index 46e987b5015..a38d79b6df0 100644 --- a/src/tests/class_tests/openms/source/DigestionEnzymeProtein_test.cpp +++ b/src/tests/class_tests/openms/source/DigestionEnzymeProtein_test.cpp @@ -275,5 +275,37 @@ END_SECTION delete e_ptr; -END_TEST +START_SECTION((DigestionEnzymeProtein(const String& name, String cut_before, Sense sense, const String& nocut_after, const std::set& synonyms, String regex_description))) +{ + DigestionEnzymeProtein trypsin_style("TrypsinStyle", "K", DigestionEnzymeProtein::Sense::C_TERM, "P"); + TEST_EQUAL(trypsin_style.getRegEx(), "(?<=[KX])(?!P])") + + DigestionEnzymeProtein arg_c("Arg-C_Style", "R", DigestionEnzymeProtein::Sense::C_TERM); + TEST_EQUAL(arg_c.getRegEx(), "(?<=[RX])") + + DigestionEnzymeProtein n_term_test("N-Term_Style", "D", DigestionEnzymeProtein::Sense::N_TERM, "E"); + TEST_EQUAL(n_term_test.getRegEx(), "(? +#include + +////////////////////////////////////////// + +#include +#include + +using namespace OpenMS; +using namespace std; + +////////////////////////////////////////// + +START_TEST(DigestionEnzyme,"$ID") + +////////////////////////////////////////// + +DigestionEnzyme* e_ptr = nullptr; +DigestionEnzyme* e_null = nullptr; + + +START_SECTION((DigestionEnzyme())) + e_ptr = new DigestionEnzyme(); + TEST_NOT_EQUAL(e_ptr,e_null) +END_SECTION + +START_SECTION((virtual ~DigestionEnzyme())) + delete e_ptr; +END_SECTION + +START_SECTION(bool setValueFromFile(const String& key, const String& value)) + DigestionEnzyme enzyme; + + // Test the Name Setting. + TEST_EQUAL(enzyme.setValueFromFile("test:Name","Trypsin"),true) + TEST_EQUAL(enzyme.getName(),"Tryp") + + // Test the RegEx Setting. + TEST_EQUAL(enzyme.setValueFromFile("test:RegEx","Reg"),true) + TEST_EQUAL(enzyme.getRegEx(), "Reg") + + // Test the RegExDescription Setting. + TEST_EQUAL(enzyme.setValueFromFile("test:RegExDescription","Desc"),true) + TEST_EQUAL(enzyme.getRegExDescription(),"Desc") + + // Test Synonym Setting + TEST_EQUAL(enzyme.setValueFromFile("syn:Synonyms","Trypsin"),true) + TEST_EQUAL(enzyme.setValueFromFile("test:Synonyms","TrypsinI"),true) + + // Since Synonyms are a set, test using set functions. + TEST_EQUAL(enzyme.getSynonyms().count("Trypsin"),1) + TEST_EQUAL(enzyme.getSynonyms().size(),2) + + // Test incorrect keys. + TEST_NOT_EQUAL(enzyme.setValueFromFile("test","Tryp-Like"),true) +END_SECTION + +START_SECTION(bool operator==(const String& cleavage_regex) const) + DigestionEnzyme enzyme; + enzyme.setRegEx("Verify"); + + TEST_EQUAL(enzyme == "Verify",true) + TEST_NOT_EQUAL(enzyme == "Accept",true) +END_SECTION + +START_SECTION(bool operator!=(const String& cleavage_regex) const) + DigestionEnzyme enzyme; + enzyme.setRegEx("Verify"); + + TEST_EQUAL(enzyme != "Accept",true) + TEST_NOT_EQUAL(enzyme != "Verfiy",true) +END_SECTION + +// < compares the names of the enzymes. +START_SECTION(bool operator<(const DigestionEnzyme& enzyme) const) + DigestionEnzyme e1,e2; + + e1.setName("A_Enzyme"); + e2.setName("B_ENZYME"); + + TEST_EQUAL(e1 < e2, true) + TEST_EQUAL(e2 < e1, false) + + // Safety test: when names are same, neither greater nor smaller, whatever the regex. + DigestionEnzyme e3; + e3.setName("A_Enzyme"); + e3.setRegEx("Greater"); + + TEST_NOT_EQUAL(e1 < e3, true) + TEST_NOT_EQUAL(e3 < e1, true) +END_SECTION + +START_SECTION(std::ostream& operator<<(std::ostream& os, const DigestionEnzyme& enzyme)) + DigestionEnzyme enzyme; + enzyme.setName("TestEnzyme"); + enzyme.setRegEx("[K]"); + enzyme.setRegExDescription("cuts at K"); + + stringstream ss; + ss << enzyme; + String output = ss.str(); + + TEST_EQUAL(output.hasSubstring("digestion enzyme: TestEnzyme"),true) + TEST_EQUAL(output.hasSubstring("(cleavage: [K] - cuts at K)"),true) +END_SECTION + +END_TEST From 892038b7a9e797729f8205cce42beefa75c7500e Mon Sep 17 00:00:00 2001 From: Alen Saric Date: Sat, 23 May 2026 13:31:43 +0200 Subject: [PATCH 20/20] =?UTF-8?q?Letzter=20Stand=20der=20Dinge=20-=20Base?= =?UTF-8?q?=20Class=20Test=20geschrieben=20und=20zuzsammenh=C3=A4ngende=20?= =?UTF-8?q?Files=20ge=C3=A4ndert.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- share/OpenMS/CHEMISTRY/unimod.xml | 6 ++--- .../OpenMS/CHEMISTRY/DigestionEnzyme.h | 2 +- .../OpenMS/CHEMISTRY/DigestionEnzymeProtein.h | 4 +-- .../CHEMISTRY/DigestionEnzymeProtein.cpp | 6 ++--- src/openms/source/FORMAT/PepXMLFile.cpp | 25 ++++++++++--------- .../class_tests/openms/executables.cmake | 3 ++- .../source/DigestionEnzymeProtein_test.cpp | 2 +- .../openms/source/DigestionEnzyme_test.cpp | 10 ++++---- 8 files changed, 30 insertions(+), 28 deletions(-) diff --git a/share/OpenMS/CHEMISTRY/unimod.xml b/share/OpenMS/CHEMISTRY/unimod.xml index 27c7c09e2c1..35b8b4d2dc1 100644 --- a/share/OpenMS/CHEMISTRY/unimod.xml +++ b/share/OpenMS/CHEMISTRY/unimod.xml @@ -23139,7 +23139,7 @@ Szychowski J, Mahdavi A, Hodas JJ, Bagert JD, Ngo JT, Landgraf P, Dieterich DC, Trypsin Digestion reminant of Ub Br probe Chemistry and Biology -Volume 9, Issue 10, 1 October 2002, Pages 1149-1159 +Volume 9, Issue 10, 1 October 2002, Pages 1149-1159 Chemistry-based functional proteomics reveals novel members of the deubiquitinating enzyme family Journal http://www.cell.com/chemistry-biology/abstract/S1074-5521%2802%2900248-X @@ -23174,7 +23174,7 @@ Chemistry-based functional proteomics reveals novel members of the deubiquitinat Chemistry and Biology -Volume 9, Issue 10, 1 October 2002, Pages 1149-1159 +Volume 9, Issue 10, 1 October 2002, Pages 1149-1159 Chemistry-based functional proteomics reveals novel members of the deubiquitinating enzyme family Journal http://www.cell.com/chemistry-biology/abstract/S1074-5521%2802%2900248-X @@ -46109,4 +46109,4 @@ antiserum and mass spectrometric analysis - \ No newline at end of file + diff --git a/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzyme.h b/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzyme.h index 84d5f46a9f7..9f7fde5dd2d 100644 --- a/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzyme.h +++ b/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzyme.h @@ -3,7 +3,7 @@ // // -------------------------------------------------------------------------- // $Maintainer: Xiao Liang $ -// $Authors: Xiao Liang $ +// $Authors: Xiao Liang, Alen Saric $ // -------------------------------------------------------------------------- // diff --git a/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzymeProtein.h b/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzymeProtein.h index 9c11df58df2..036dc9204c5 100644 --- a/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzymeProtein.h +++ b/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzymeProtein.h @@ -3,7 +3,7 @@ // // -------------------------------------------------------------------------- // $Maintainer: Xiao Liang $ -// $Authors: Xiao Liang $ +// $Authors: Xiao Liang, Alen Saric $ // -------------------------------------------------------------------------- // @@ -60,7 +60,7 @@ namespace OpenMS const std::set& synonyms = std::set(), String regex_description = ""); - /// Destructor + /// Destructor ~DigestionEnzymeProtein() override; //@} diff --git a/src/openms/source/CHEMISTRY/DigestionEnzymeProtein.cpp b/src/openms/source/CHEMISTRY/DigestionEnzymeProtein.cpp index a6772fb3c98..3c5140e5521 100644 --- a/src/openms/source/CHEMISTRY/DigestionEnzymeProtein.cpp +++ b/src/openms/source/CHEMISTRY/DigestionEnzymeProtein.cpp @@ -63,9 +63,9 @@ namespace OpenMS DigestionEnzymeProtein::DigestionEnzymeProtein(const String& name, String cut_before, Sense sense, - const String& nocut_after = "", - const std::set& synonyms = std::set(), - String regex_description = ""): + const String& nocut_after, + const std::set& synonyms, + String regex_description): DigestionEnzyme(name, buildRegex_(cut_before, nocut_after, sense), synonyms, std::move(regex_description)) { } diff --git a/src/openms/source/FORMAT/PepXMLFile.cpp b/src/openms/source/FORMAT/PepXMLFile.cpp index d2e1da05995..23e7aca82b7 100644 --- a/src/openms/source/FORMAT/PepXMLFile.cpp +++ b/src/openms/source/FORMAT/PepXMLFile.cpp @@ -1254,7 +1254,7 @@ namespace OpenMS value = attributeAsDouble_(attributes, "value"); peptide_hit_.setMetaValue("Comet:lnrSp", value); // name: Comet:lnrSp peptide_hit_.setMetaValue("COMET:lnRankSP", value); // name: COMET:lnRankSP - } + } else if (name == "deltLCn") { value = attributeAsDouble_(attributes, "value"); @@ -1263,7 +1263,7 @@ namespace OpenMS else if (name == "lnExpect") { value = attributeAsDouble_(attributes, "value"); - peptide_hit_.setMetaValue("COMET:lnExpect", value); // name: Comet:lnExpect + peptide_hit_.setMetaValue("COMET:lnExpect", value); // name: Comet:lnExpect } else if (name == "IonFrac") { @@ -1275,7 +1275,7 @@ namespace OpenMS { value = attributeAsDouble_(attributes, "value"); peptide_hit_.setMetaValue("COMET:lnNumSP", value); // name: Comet:lnNumSP - } + } } else if (parse_unknown_scores_) { @@ -1295,7 +1295,7 @@ namespace OpenMS //TODO warn about non-numeric score? Or even do not catch the conversion error? peptide_hit_.setMetaValue(name, attributeAsString_(attributes, "value")); // Any other generic score (fallback String) } - + } } } @@ -1350,7 +1350,7 @@ namespace OpenMS { bool current_prot_is_decoy = protein.hasPrefix(decoy_prefix_); auto current_type = peptide_hit_.getTargetDecoyType(); - + if (current_type == PeptideHit::TargetDecoyType::UNKNOWN) { // No annotation yet, set based on current protein @@ -1403,8 +1403,8 @@ namespace OpenMS current_peptide_.setSpectrumReference( String("scan=") + String(scannr_)); } //TODO else error? - - + + if (!experiment_label_.empty()) { current_peptide_.setExperimentLabel(experiment_label_); @@ -1650,7 +1650,7 @@ namespace OpenMS { bool current_prot_is_decoy = protein.hasPrefix(decoy_prefix_); auto current_type = peptide_hit_.getTargetDecoyType(); - + if (current_type == PeptideHit::TargetDecoyType::UNKNOWN) { // No annotation yet, set based on current protein @@ -1664,7 +1664,7 @@ namespace OpenMS // Peptide matches both target and decoy proteins peptide_hit_.setTargetDecoyType(PeptideHit::TargetDecoyType::TARGET_DECOY); } - + hit.setTargetDecoyType(current_prot_is_decoy ? ProteinHit::TargetDecoyType::DECOY : ProteinHit::TargetDecoyType::TARGET); @@ -1926,16 +1926,17 @@ namespace OpenMS String cut_before = attributeAsString_(attributes, "cut"); String no_cut_after = attributeAsString_(attributes, "no_cut"); String sense = attributeAsString_(attributes, "sense"); - params_.digestion_enzyme = DigestionEnzymeProtein(DigestionEnzyme( + DigestionEnzymeProtein::Sense sen = (sense.toLower() == "c") ? sen = DigestionEnzymeProtein::Sense::C_TERM : sen = DigestionEnzymeProtein::Sense::N_TERM; + params_.digestion_enzyme = DigestionEnzymeProtein( "user-defined," + enzyme_ + "," + cut_before + "," + no_cut_after + "," + sense, - cut_before, no_cut_after, sense)); + cut_before, sen, no_cut_after); } else if (element == "enzymatic_search_constraint") // parent: "search_summary" { //TODO we should not overwrite the enzyme here! Luckily in most files it is the same // enzyme as in sample_enzyme or something useless like "default". /// - enzyme_ = attributeAsString_(attributes, "enzyme"); + enzyme_ = attributeAsString_(attributes, "enzyme"); if (enzyme_ == "stricttrypsin") enzyme_ = "Trypsin/P"; // MSFragger synonyme if (ProteaseDB::getInstance()->hasEnzyme(enzyme_)) diff --git a/src/tests/class_tests/openms/executables.cmake b/src/tests/class_tests/openms/executables.cmake index dfb10a50cb5..721a42c2d52 100644 --- a/src/tests/class_tests/openms/executables.cmake +++ b/src/tests/class_tests/openms/executables.cmake @@ -410,6 +410,7 @@ set(chemistry_executables_list CoarseIsotopeDistribution_test CrossLinksDB_test DecoyGenerator_test + DigestionEnzyme_test DigestionEnzymeProtein_test ElementDB_test Element_test @@ -599,7 +600,7 @@ set(transformations_executables_list EmgFitter1D_test EmgModel_test ExtendedIsotopeFitter1D_test - ExtendedIsotopeModel_test + ExtendedIsotopeModel_test FeatureFinderAlgorithmPickedHelperStructs_test FeatureFinderAlgorithmPicked_test FeatureFinderIdentificationAlgorithm_test diff --git a/src/tests/class_tests/openms/source/DigestionEnzymeProtein_test.cpp b/src/tests/class_tests/openms/source/DigestionEnzymeProtein_test.cpp index a38d79b6df0..90c057e6426 100644 --- a/src/tests/class_tests/openms/source/DigestionEnzymeProtein_test.cpp +++ b/src/tests/class_tests/openms/source/DigestionEnzymeProtein_test.cpp @@ -3,7 +3,7 @@ // // -------------------------------------------------------------------------- // $Maintainer: Xiao Liang $ -// $Authors: Xiao Liang $ +// $Authors: Xiao Liang, Alen Saric $ // -------------------------------------------------------------------------- // diff --git a/src/tests/class_tests/openms/source/DigestionEnzyme_test.cpp b/src/tests/class_tests/openms/source/DigestionEnzyme_test.cpp index cb2880a92c3..d9c91f0f6a6 100644 --- a/src/tests/class_tests/openms/source/DigestionEnzyme_test.cpp +++ b/src/tests/class_tests/openms/source/DigestionEnzyme_test.cpp @@ -42,7 +42,7 @@ START_SECTION(bool setValueFromFile(const String& key, const String& value)) // Test the Name Setting. TEST_EQUAL(enzyme.setValueFromFile("test:Name","Trypsin"),true) - TEST_EQUAL(enzyme.getName(),"Tryp") + TEST_EQUAL(enzyme.getName(),"Trypsin") // Test the RegEx Setting. TEST_EQUAL(enzyme.setValueFromFile("test:RegEx","Reg"),true) @@ -53,8 +53,8 @@ START_SECTION(bool setValueFromFile(const String& key, const String& value)) TEST_EQUAL(enzyme.getRegExDescription(),"Desc") // Test Synonym Setting - TEST_EQUAL(enzyme.setValueFromFile("syn:Synonyms","Trypsin"),true) - TEST_EQUAL(enzyme.setValueFromFile("test:Synonyms","TrypsinI"),true) + TEST_EQUAL(enzyme.setValueFromFile("syn:Synonyms:","Trypsin"),true) + TEST_EQUAL(enzyme.setValueFromFile("test:Synonyms:","TrypsinI"),true) // Since Synonyms are a set, test using set functions. TEST_EQUAL(enzyme.getSynonyms().count("Trypsin"),1) @@ -77,7 +77,7 @@ START_SECTION(bool operator!=(const String& cleavage_regex) const) enzyme.setRegEx("Verify"); TEST_EQUAL(enzyme != "Accept",true) - TEST_NOT_EQUAL(enzyme != "Verfiy",true) + TEST_NOT_EQUAL(enzyme != "Verify",true) END_SECTION // < compares the names of the enzymes. @@ -109,7 +109,7 @@ START_SECTION(std::ostream& operator<<(std::ostream& os, const DigestionEnzyme& ss << enzyme; String output = ss.str(); - TEST_EQUAL(output.hasSubstring("digestion enzyme: TestEnzyme"),true) + TEST_EQUAL(output.hasSubstring("digestion enzyme:TestEnzyme"),true) TEST_EQUAL(output.hasSubstring("(cleavage: [K] - cuts at K)"),true) END_SECTION