diff --git a/.github/workflows/openms_ci_matrix_full.yml b/.github/workflows/openms_ci_matrix_full.yml index 73afde4ee31..06db1e2541f 100644 --- a/.github/workflows/openms_ci_matrix_full.yml +++ b/.github/workflows/openms_ci_matrix_full.yml @@ -38,7 +38,60 @@ on: # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: + # NOTE: This job is temporary until we get rid of contrib. + prepare-contrib-for-windows: + runs-on: windows-2025 + steps: + - name: Clone OpenMS + uses: actions/checkout@v6 + - name: Set up Visual Studio shell + uses: egor-tensin/vs-shell@v2 + with: { arch: x64 } + + - name: Identify contrib submodule revision for cache key + id: contrib-rev + shell: bash + run: | + rev=$(git submodule status --cached -- contrib | sed -E 's/^-?//' | cut -d' ' -f1) + echo "rev=$rev" >> "$GITHUB_OUTPUT" + + - name: Cache contrib + id: cache-contrib + uses: actions/cache@v5 + with: + path: contrib/build + key: ${{ runner.os }}-${{ steps.contrib-rev.outputs.rev }} + + - name: Fetch contrib if not in cache + if: steps.cache-contrib.outputs.cache-hit != 'true' + shell: bash + run: | + git submodule update --init -- contrib + mkdir -p contrib/build + + - name: Build contrib if not in cache + if: steps.cache-contrib.outputs.cache-hit != 'true' + shell: pwsh + run: | + $NPROC = $env:NUMBER_OF_PROCESSORS + if (-not $NPROC) { $NPROC = 4 } + $env:CMAKE_BUILD_PARALLEL_LEVEL = $NPROC + Set-Location "${{ github.workspace }}\contrib\build" + cmake -G"Visual Studio 17 2022" -Ax64 -DBUILD_TYPE=ALL "${{ github.workspace }}\contrib" + if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE } + + - name: Upload Contrib + uses: actions/upload-artifact@v7 + with: + name: contrib-${{ steps.contrib-rev.outputs.rev }} + path: | + contrib/build/lib/ + contrib/build/bin/ + contrib/build/include/ + retention-days: 7 + build-and-test: + needs: prepare-contrib-for-windows strategy: fail-fast: false matrix: @@ -326,28 +379,6 @@ jobs: echo "cmake_prefix=$(brew --prefix);$(brew --prefix libomp)" >>"$GITHUB_OUTPUT" fi - - name: Cache contrib (Windows) - if: startsWith(matrix.os, 'windows') - id: cache-contrib - uses: actions/cache@v4 - with: - path: ${{ github.workspace }}/OpenMS/contrib - key: ${{ runner.os }}-contrib4 - - - name: Download contrib build from archive (Windows) - if: startsWith(matrix.os, 'windows') && steps.cache-contrib.outputs.cache-hit != 'true' - shell: bash - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - cd OpenMS/contrib - # Download the file using the URL fetched from GitHub - gh release download -R OpenMS/contrib --pattern 'contrib_build-Windows.tar.gz' - # Extract the archive - 7z x -so contrib_build-Windows.tar.gz | 7z x -si -ttar - rm contrib_build-Windows.tar.gz - ls -la - - name: Build libcurl from source (Windows) if: startsWith(matrix.os, 'windows') shell: bash @@ -378,6 +409,21 @@ jobs: ${{ runner.os }}-${{ runner.arch }}-${{ matrix.compiler }}-${{ matrix.compiler_ver }}-ccache-develop ${{ runner.os }}-${{ runner.arch }}-${{ matrix.compiler }}-${{ matrix.compiler_ver }}-ccache- + - name: Identify contrib submodule revision for cache key + id: contrib-rev + shell: bash + run: | + cd OpenMS + rev=$(git submodule status --cached -- contrib | sed -E 's/^-?//' | cut -d' ' -f1) + echo "rev=$rev" >> "$GITHUB_OUTPUT" + + - name: Download contrib artifact for Windows + if: runner.os == 'Windows' + uses: actions/download-artifact@v4 + with: + name: contrib-${{ steps.contrib-rev.outputs.rev }} + path: OpenMS/contrib/build + - name: Add THIRDPARTY shell: bash run: | @@ -453,7 +499,7 @@ jobs: PYOPENMS: "${{ startsWith(matrix.os, 'ubuntu') && matrix.compiler == 'g++' && 'ON' || 'OFF' }}" NO_DEPENDENCIES: "${{ startsWith(matrix.os, 'ubuntu') && matrix.compiler == 'g++' && 'OFF' || 'ON' }}" CMAKE_PREFIX_PATH: "${{ steps.tools-prefix.outputs.cmake_prefix }};${{ steps.conda-setup.outputs.cmake_prefix }}" - OPENMS_CONTRIB_LIBS: "${{ github.workspace }}/OpenMS/contrib" + OPENMS_CONTRIB_LIBS: "${{ github.workspace }}/OpenMS/contrib/build" CI_PROVIDER: "GitHub-Actions" CMAKE_GENERATOR: "Ninja" SOURCE_DIRECTORY: "${{ github.workspace }}/OpenMS" diff --git a/AUTHORS b/AUTHORS index 87cb713078a..7ed43f41be1 100644 --- a/AUTHORS +++ b/AUTHORS @@ -8,6 +8,7 @@ the authors tag in the respective file header. - Achal Bajpai - Aditya Muzumdar - Ahmed Khalil + - Alen Saric - Alexandra Scherbart - Alexandra Zerck - Amanda Wein @@ -73,6 +74,7 @@ the authors tag in the respective file header. - Leon Bichmann - Leon Kuchenbecker - Lucia Espona + - Luis Jacob Keller - Lukas Mueller - Lukas Zimmermann - Marcel Schilling diff --git a/CHANGELOG b/CHANGELOG index 22cb7ea9506..31837e24dbd 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -33,6 +33,10 @@ General: - Fix: TheoreticalSpectrumGenerator: corrected m/z calculation for b/a neutral-loss fragment ions; the neutral-loss loop previously omitted the first residue mass, causing all b/a neutral-loss fragments to report masses that were too low by the first residue's internal mass (#9078, #9084) - Fix: LinearResampler: right_index was clamped to the number of raw data points instead of the number of resampled points; when resampled points far outnumber raw points (e.g. fine spacing over a wide m/z range), this caused extreme intensity spikes; bug introduced in 2008 and masked by LinearResamplerAlign which has an independent correct implementation (#9127) - Transparent .zip decompression for XML-based formats (mzML, idXML, etc.) and Bruker .d directories: files with a .zip extension are now decompressed on-the-fly during parsing; .d.zip archives are extracted to a temporary directory and loaded via BrukerTimsFile; no conversion step required (#9139) + - Fix: BrukerTimsFile: DDA-PASEF and DIA-PASEF spectra are now marked as CENTROID (PeakType::PEAK) so downstream tools that check spectrum type (e.g. FeatureFinderMultiplex, OpenSwathWorkflow) accept them without extra conversion (#9154) + - Bruker TimsTOF MS1 frame aggregation for DIA-PASEF and DDA-PASEF: new BrukerTimsFile::Config fields (ms1_n_neighbors, ms1_min_support, ms1_max_rt_distance_sec, ms1_centroid_max_peaks) sum peaks from 2N+1 adjacent MS1 frames into a sparse (mz_bin, scan_id) grid per center frame, composable with the existing within-frame IM centroiding (ms1_centroid_mz_ppm and ms1_centroid_im_pct). Exposed through FileConverter as -bruker:ms1_n_neighbors, -bruker:ms1_min_support, -bruker:ms1_max_rt_distance_sec, -bruker:ms1_centroid_max_peaks. Disabled by default. Also renames the internal DIAFrameAggregator class to FrameAggregator (scope-agnostic now that MS1 uses it), guards its (mz_bin, scan_id) grid lookups against uint32 wrap at scan_id=0, and switches the skip_denoise predicate in the MS2 aggregation loop from index-span to actual contributing-frame count (closes #9151 TODO). + - BrukerTimsFile: DDA MS2 native IDs changed from "scan=" to "frame= scan= precursor=". The frame/scan pair follows the PSI-MS MS:1002818 ("Bruker TDF nativeID format") convention (scan is the TIMS isolation-window scan_begin); the trailing "precursor=

" token is required for uniqueness under OpenMS's per-Precursors.Id aggregation (different precursors can share the same (frame, scan_begin) via PASEF co-isolation with different IsolationMz, and multi-cycle precursors can collide on their first entry) — this mirrors the convention pwiz uses in its "combined" mode ("merged=N frame=F scanStart=A scanEnd=B"), where extra tokens are added while the SourceFile CV accession remains MS:1002818. The Bruker Precursors.Id is ALSO duplicated as a typed MetaValue "bruker_precursor_id" for direct programmatic access. SpectrumNativeIDParser now recognizes MS:1002818 and the "frame=" prefix so downstream scan-number extraction (SpectrumLookup, USI, PepXMLFile, PercolatorInfile, IDMapper) works for Bruker .d output. New BrukerTimsFile::Config field load_ms1 (CLI: -bruker:load_ms1, default true): disable to skip MS1 loading entirely for MS2-only workflows (peptide database search) — cuts memory and time substantially. CometAdapter automatically sets load_ms1=false when searching MS2. BrukerTimsFile::transform() progress now matches actually-emitted spectra in DIA mode (per-WindowGroup sum, not frames × all_windows) and respects load_ms1 across all modes. + - CometAdapter: workaround for a latent bug in Comet's bundled mzParser (MSToolkit/saxmzmlhandler.cpp) that segfaults when mzML files contain mixed "frame=..." and "scan=..." native IDs with overlapping numeric values — which is exactly the Bruker .d DDA/DIA case. CometAdapter rewrites Bruker .d native IDs to monotonic "index=N" before handing the temp mzML to Comet, and translates PSM spectrum references back to the original Bruker IDs after Comet finishes. Upstream fix for cindex::compare's strict-weak-ordering violation should be filed against UWPR/Comet. Dependencies: - PyOpenMS: Autowrap/Cython replaced with nanobind; nanobind 2.10.0+ fetched automatically (#8699) @@ -50,6 +54,8 @@ Dependencies: - boost-process added as a header-only dependency for cross-platform subprocess management (#8939) - ENABLE_TDL CMake option (required for CWL/Common Workflow Language file support) now defaults to OFF; set -DENABLE_TDL=ON to re-enable (#9067) - Boost minimum version requirement raised from 1.78 to 1.81 (#9095) + - opentims: CMake now searches for a system-installed opentims_cpp via FindOpentims.cmake before falling back to a FetchContent git clone; packagers and users with opentims already installed can set opentims_DIR or opentims_ROOT to skip the download (#9147, #9148) + - contrib: the contrib directory is now a git submodule; developers and packagers building from a git clone must run `git submodule update --init` (or clone with `--recurse-submodules`) to obtain the vendored third-party libraries (#9155) TOPP tools: Changes: @@ -111,6 +117,7 @@ TOPP tools: - Bruker DIA MS2 frame aggregation, denoising, and OpenSwath streaming: new BrukerTimsFile::Config fields (bruker:dia_ms2_n_neighbors, bruker:dia_ms2_min_support, bruker:dia_ms2_centroid) enable grid-based denoising and 2D peak picking of DIA MS2 frames; aggregated spectra can be streamed directly into the OpenSwath pipeline without prior mzML conversion; registered in FileConverter and PeakPickerIM; requires WITH_TIMSRUST=ON (#9129) - Fix: BrukerTimsFile::loadDIAStreaming now sorts each spectrum's peaks by m/z before feeding them to the OpenSwath consumer; unsorted peaks (caused by TIMS scan ordering) silently produced empty chromatograms, breaking iRT calibration on .d DIA input (#9135) - Fix: segfault when use_consensus=false; hull_points returned by PeakIntegrator::integratePeak varied in size per-transition, causing Eigen::Map to read past the end of the shorter vector in MRMScoring::initializeXCorrMatrix; fixed by padding hull_points to the full resampled chromatogram length in non-consensus mode (#9143) + - Allow Bruker DIA MS2 centroiding without denoising: bruker:dia_ms2_min_support can now be set to 0; with centroid=true and min_support=0 every populated grid cell is a centroid candidate, skipping the neighbor-support filter; useful for low-density data where the 3×3 denoising step discards real peaks (#9151) PyOpenMS: - BREAKING: Binding backend replaced from Autowrap/Cython to hand-maintained nanobind C++ bindings (#8699) @@ -291,6 +298,7 @@ OpenMS Library: - BREAKING: PeptideAndProteinQuant channel identifiers changed from Int to UInt (#8516) - BREAKING: Plain enums migrated to enum class (Sample::SampleState, OpenMS_OS, etc.) (#8622) - ILPDCWrapper: Performance improvement using std::unordered_map (~3.5x speedup) (#8618) + - DELETION: deleted Class LinearResampler and refactored functionality into LinearResamplerAlign (#9153) - OpenSwath: Use unordered_map for O(1) lookups, improving performance (#8651) - DataValue: Enhanced conversion error messages with type and value context (#8572) - BayesianProteinInferenceAlgorithm: Now requires single merged ProteinIdentification run (#8480) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 72ec8d41807..6655e135d0c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -29,14 +29,14 @@ If you are an official OpenMS team member: # Opening a Pull Request -Before getting started we recommend taking a look at our GitHub-Wiki: https://github.com/OpenMS/OpenMS/wiki#-for-developers +Before getting started we recommend taking a look at the developer guide: +https://openms.readthedocs.io/en/latest/manual/develop.html Before you open the pull request, make sure you - - adhere to [our coding conventions](https://github.com/OpenMS/OpenMS/wiki/Coding-conventions) - - have [unit tests and functional tests](https://github.com/OpenMS/OpenMS/wiki/Write-tests) - see also [this example](https://github.com/OpenMS/OpenMS/blob/develop/src/tests/class_tests/openms/source/MSNumpressCoder_test.cpp) - - Have [proper documentation](https://github.com/OpenMS/OpenMS/wiki/Coding-conventions#doxygen) - see also [this example](https://github.com/OpenMS/OpenMS/blob/develop/src/openms/include/OpenMS/FORMAT/MSNumpressCoder.h) + - adhere to [our coding conventions](https://openms.readthedocs.io/en/latest/manual/develop.html#coding-conventions) + - have [unit tests and functional tests](https://openms.readthedocs.io/en/latest/manual/develop.html#automated-unit-tests) - see also [this example](https://github.com/OpenMS/OpenMS/blob/develop/src/tests/class_tests/openms/source/MSNumpressCoder_test.cpp) + - have [proper documentation](https://openms.readthedocs.io/en/latest/manual/develop/developer-faq.html#doxygen-documentation) - see also [this example](https://github.com/OpenMS/OpenMS/blob/develop/src/openms/include/OpenMS/FORMAT/MSNumpressCoder.h) - have Python bindings — nanobind binding files are in `src/pyOpenMS/bindings/`; see `src/pyOpenMS/CLAUDE.md` for wrapping instructions - A core developer will review your changes to the main development branch (develop) and approve them (or ask for modifications). You may indicate the prefered reviewer(s) by adding links to them in a comment section (e.g., @cbielow @hendrikweisser @hroest @jpfeuffer @timosachsenberg) Also consider getting in contact with the core developers early. They might provide additional guidance and valuable information on how your specific aim is achieved. This might give you a head start in, for example, developing novel tools or algorithms. diff --git a/cmake/cmake_findExternalLibs.cmake b/cmake/cmake_findExternalLibs.cmake index 4c78428c41e..610170a88c2 100644 --- a/cmake/cmake_findExternalLibs.cmake +++ b/cmake/cmake_findExternalLibs.cmake @@ -376,7 +376,7 @@ if (WITH_OPENTIMS) # conservatively inject sqlite3 because we cannot know how it was built. get_target_property(_opentims_defs opentims::opentims_cpp INTERFACE_COMPILE_DEFINITIONS) if(_opentims_defs MATCHES "OPENTIMS_LINK_SQLITE_STATICALLY" - OR (_opentims_defs MATCHES "NOTFOUND" AND NOT opentims_FOUND)) + OR ((_opentims_defs MATCHES "NOTFOUND" OR _opentims_defs STREQUAL "") AND NOT opentims_FOUND)) target_include_directories(opentims::opentims_cpp INTERFACE "${CMAKE_SOURCE_DIR}/src/openms/extern/SQLiteCpp/sqlite3") target_link_libraries(opentims::opentims_cpp INTERFACE sqlite3) @@ -398,12 +398,31 @@ if (WITH_OPENTIMS) set(OPENTIMS_BUILD_LIB ON CACHE BOOL "" FORCE) set(OPENTIMS_BUILD_PYTHON OFF CACHE BOOL "" FORCE) set(OPENTIMS_LINK_SQLITE_STATICALLY ON CACHE BOOL "" FORCE) - FetchContent_MakeAvailable(opentims) - # Provide OpenMS's sqlite3 headers and library to opentims + # OpenMS sets BUILD_SHARED_LIBS=true (to build libOpenMS.so). Without this + # override, opentims's CMakeLists.txt would build libopentims_cpp.so instead + # of libopentims_cpp.a. A shared opentims would land in _deps/opentims-build/ + # rather than the system lib path, so the build-time linker would fail with + # "undefined reference to TimsDataHandle" when linking test binaries against + # libOpenMS.so (because libopentims_cpp.so's directory is not in the test's + # -L search path). Force static so all opentims symbols get absorbed into + # libOpenMS.so at link time. + set(_openms_saved_build_shared_libs ${BUILD_SHARED_LIBS}) + set(BUILD_SHARED_LIBS OFF) + FetchContent_MakeAvailable(opentims) + set(BUILD_SHARED_LIBS ${_openms_saved_build_shared_libs}) + + # Provide OpenMS's sqlite3 headers to opentims so it compiles with + # OPENTIMS_LINK_SQLITE_STATICALLY (direct sqlite3 calls vs. dlopen). + # We intentionally do NOT call target_link_libraries(opentims_cpp PRIVATE sqlite3) + # here: adding the CMake sqlite3 target as a PRIVATE dep of a STATIC library + # would require sqlite3 to appear in the CMake export sets, which creates + # conflicts with SQLiteCpp's own sqlite3 export. The sqlite3 symbols are + # resolved at final link time through OpenMS's own SQLiteCpp dependency + # (SQLiteCpp PUBLIC-links sqlite3, so libsqlite3.a already appears in + # libOpenMS.so's link command after libopentims_cpp.a). target_include_directories(opentims_cpp PRIVATE "${CMAKE_SOURCE_DIR}/src/openms/extern/SQLiteCpp/sqlite3") - target_link_libraries(opentims_cpp PRIVATE sqlite3) # ZSTD: prefer system; fall back to opentims's bundled decoder. set(_OPENTIMS_SRC "${opentims_SOURCE_DIR}/src/opentims++") @@ -427,11 +446,12 @@ if (WITH_OPENTIMS) # opentims::opentims_cpp regardless of how the library was obtained. add_library(opentims::opentims_cpp ALIAS opentims_cpp) - # Add opentims_cpp to the OpenMSTargets export set (same as Evergreen, - # SQLiteCpp, and other bundled deps). CMake requires all targets linked - # by an exported target to be either imported or in an export set — even - # for PRIVATE deps of a shared library. + # Add opentims_cpp to both the install-tree and build-tree export sets + # (same pattern as Evergreen, IsoSpec, and other bundled deps). + # install_library() → OpenMSTargets install export + # openms_register_export_target() → _OPENMS_EXPORT_TARGETS build-tree export() install_library(opentims_cpp) + openms_register_export_target(opentims_cpp) message(STATUS "opentims: built from source (${opentims_SOURCE_DIR})") endif() diff --git a/doc/code_examples/Tutorial_SavitzkyGolayFilter.cpp b/doc/code_examples/Tutorial_SavitzkyGolayFilter.cpp index 27f57d990af..8b24e5827c2 100644 --- a/doc/code_examples/Tutorial_SavitzkyGolayFilter.cpp +++ b/doc/code_examples/Tutorial_SavitzkyGolayFilter.cpp @@ -3,7 +3,7 @@ // #include -#include +#include #include #include #include // exotic header for path to tutorial data @@ -21,7 +21,7 @@ int main(int argc, const char** argv) // Load the dta file into the spectrum FileHandler().loadSpectrum(file_dta, spectrum); - LinearResampler lr; + LinearResamplerAlign lr; Param param_lr; param_lr.setValue("spacing", 0.01); lr.setParameters(param_lr); diff --git a/doc/doxygen/parameters/DefaultParamHandlerDocumenter.cpp b/doc/doxygen/parameters/DefaultParamHandlerDocumenter.cpp index 2b6deda729b..899ce9614b2 100644 --- a/doc/doxygen/parameters/DefaultParamHandlerDocumenter.cpp +++ b/doc/doxygen/parameters/DefaultParamHandlerDocumenter.cpp @@ -90,7 +90,6 @@ #include #include #include -#include #include #include #include @@ -390,7 +389,6 @@ int main(int argc, char** argv) DOCME(ItraqEightPlexQuantitationMethod); DOCME(ItraqFourPlexQuantitationMethod); DOCME(LabeledPairFinder); - DOCME(LinearResampler); DOCME(MSPFile); DOCME(MapAlignmentAlgorithmPoseClustering); DOCME(SpectrumAnnotator); diff --git a/share/OpenMS/CHEMISTRY/unimod.xml b/share/OpenMS/CHEMISTRY/unimod.xml index 27c7c09e2c1..35b8b4d2dc1 100644 --- a/share/OpenMS/CHEMISTRY/unimod.xml +++ b/share/OpenMS/CHEMISTRY/unimod.xml @@ -23139,7 +23139,7 @@ Szychowski J, Mahdavi A, Hodas JJ, Bagert JD, Ngo JT, Landgraf P, Dieterich DC, Trypsin Digestion reminant of Ub Br probe Chemistry and Biology -Volume 9, Issue 10, 1 October 2002, Pages 1149-1159 +Volume 9, Issue 10, 1 October 2002, Pages 1149-1159 Chemistry-based functional proteomics reveals novel members of the deubiquitinating enzyme family Journal http://www.cell.com/chemistry-biology/abstract/S1074-5521%2802%2900248-X @@ -23174,7 +23174,7 @@ Chemistry-based functional proteomics reveals novel members of the deubiquitinat Chemistry and Biology -Volume 9, Issue 10, 1 October 2002, Pages 1149-1159 +Volume 9, Issue 10, 1 October 2002, Pages 1149-1159 Chemistry-based functional proteomics reveals novel members of the deubiquitinating enzyme family Journal http://www.cell.com/chemistry-biology/abstract/S1074-5521%2802%2900248-X @@ -46109,4 +46109,4 @@ antiserum and mass spectrometric analysis - \ No newline at end of file + diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/ProSEAlgorithm.h b/src/openms/include/OpenMS/ANALYSIS/ID/ProSEAlgorithm.h index fcf2bd8e923..d1d741b6c39 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/ProSEAlgorithm.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/ProSEAlgorithm.h @@ -436,6 +436,13 @@ class OPENMS_DLLAPI ProSEAlgorithm : Size report_top_hits_; + bool add_a_ions_{false}; + bool add_b_ions_{true}; + bool add_c_ions_{false}; + bool add_x_ions_{false}; + bool add_y_ions_{true}; + bool add_z_ions_{false}; + bool calibration_enabled_{false}; double calibration_subset_ratio_{0.1}; Size calibration_min_psms_{50}; diff --git a/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzyme.h b/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzyme.h index d9f95631555..9f7fde5dd2d 100644 --- a/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzyme.h +++ b/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzyme.h @@ -3,7 +3,7 @@ // // -------------------------------------------------------------------------- // $Maintainer: Xiao Liang $ -// $Authors: Xiao Liang $ +// $Authors: Xiao Liang, Alen Saric $ // -------------------------------------------------------------------------- // @@ -25,13 +25,15 @@ namespace OpenMS @brief Base class for digestion enzymes */ - class OPENMS_DLLAPI DigestionEnzyme - { - public: + class OPENMS_DLLAPI DigestionEnzyme + { + public: - /** @name Constructors + /** @name Constructors */ - //@{ + //@{ + /// default constructor + DigestionEnzyme(); /// Copy constructor DigestionEnzyme(const DigestionEnzyme&) = default; @@ -44,14 +46,6 @@ namespace OpenMS const std::set& synonyms = std::set(), String regex_description = ""); - /// Detailed constructor 2 - explicit DigestionEnzyme(const String& name, - String cut_before, - const String& nocut_after = "", - String sense = "C", - const std::set& synonyms = std::set(), - String regex_description = ""); - /// Destructor virtual ~DigestionEnzyme(); //@} @@ -128,8 +122,6 @@ namespace OpenMS protected: - /// default constructor - DigestionEnzyme(); // basic String name_; @@ -164,4 +156,3 @@ namespace std } }; } // namespace std - diff --git a/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzymeProtein.h b/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzymeProtein.h index 394100355fe..036dc9204c5 100644 --- a/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzymeProtein.h +++ b/src/openms/include/OpenMS/CHEMISTRY/DigestionEnzymeProtein.h @@ -3,7 +3,7 @@ // // -------------------------------------------------------------------------- // $Maintainer: Xiao Liang $ -// $Authors: Xiao Liang $ +// $Authors: Xiao Liang, Alen Saric $ // -------------------------------------------------------------------------- // @@ -23,7 +23,7 @@ namespace OpenMS public DigestionEnzyme { public: - + enum Sense {C_TERM,N_TERM}; /** @name Constructors */ //@{ @@ -53,7 +53,14 @@ namespace OpenMS Int msgf_id = -1, Int omssa_id = -1); - /// Destructor + explicit DigestionEnzymeProtein(const String& name, + String cut_before, + Sense sense, + const String& nocut_after = "", + const std::set& synonyms = std::set(), + String regex_description = ""); + + /// Destructor ~DigestionEnzymeProtein() override; //@} @@ -159,10 +166,11 @@ namespace OpenMS Int omssa_id_; + String buildRegex_(String& cut_before, const String& nocut_after,const DigestionEnzymeProtein::Sense& sense); }; + OPENMS_DLLAPI std::ostream& operator<<(std::ostream& os, const DigestionEnzymeProtein& enzyme); typedef DigestionEnzymeProtein Protease; } - diff --git a/src/openms/include/OpenMS/FORMAT/BrukerTimsFile.h b/src/openms/include/OpenMS/FORMAT/BrukerTimsFile.h index b536b055236..cc9ddc14064 100644 --- a/src/openms/include/OpenMS/FORMAT/BrukerTimsFile.h +++ b/src/openms/include/OpenMS/FORMAT/BrukerTimsFile.h @@ -52,13 +52,31 @@ namespace OpenMS double calibration_tolerance = 0.0; ///< m/z recalibration tolerance in Da (0 = default 0.1 Da) bool calibrate = false; ///< Enable m/z recalibration (off by default; may fail on some datasets) + bool load_ms1 = true; ///< Load MS1 spectra. Disable (false) for MS2-only workflows + ///< (peptide database search) where MS1 surveys are not needed — + ///< cuts memory and time substantially. Affects all export modes. + float ms1_centroid_mz_ppm = 0.0f; ///< MS1 IM-centroiding m/z tolerance in ppm (0 = disabled, suggested: 5.0). Adapted from Sage (Lazear 2023, doi:10.1021/acs.jproteome.3c00486). float ms1_centroid_im_pct = 0.0f; ///< MS1 IM-centroiding ion mobility tolerance in percent (0 = disabled, suggested: 3.0) + int ms1_centroid_max_peaks = 100000; ///< Upper bound on centroided peaks per MS1 spectrum. Caps retention at the top-intensity peaks; low-intensity tail is dropped when the limit is hit (a warning is emitted at that point). Sized for aggregated MS1 where frame-stacked grids commonly exceed 200k peaks; per-frame MS1 rarely hits even 10k. int dia_ms2_n_neighbors = 0; ///< DIA MS2 frame aggregation: number of adjacent frames on each side (0 = disabled, 1 = 3-frame sum, 2 = 5-frame sum) int dia_ms2_min_support = 1; ///< DIA MS2 denoising: minimum occupied neighbors in 3x3 (mz x IM) grid to retain a point (center excluded) bool dia_ms2_centroid = false; ///< DIA MS2 2D peak picking: apply Gaussian smoothing + local maxima detection to produce IM_CENTROIDED spectra + int ms1_n_neighbors = 0; ///< MS1 frame aggregation: adjacent MS1 frames on each side + ///< (0 = disabled, 1 = 3-frame sum, 2 = 5-frame sum). Applies + ///< to both DIA and DDA; ignored in FRAME export mode. + int ms1_min_support = 0; ///< MS1 denoising after aggregation: min occupied 3x3 neighbors + ///< (0 = disabled). Only effective when ms1_n_neighbors > 0. + ///< Differs from dia_ms2_min_support (default 1) to preserve + ///< zero-change defaults for the MS1 path. + double ms1_max_rt_distance_sec = 0.0; ///< Advanced: cap the RT distance (seconds) between a neighbor + ///< MS1 frame and the center frame during aggregation + ///< (0.0 = no cap). The center frame is always included + ///< regardless of this cap. Recommended for DDA where MS1 + ///< frame cadence is irregular. + enum ExportMode { AUTO, SPECTRUM, FRAME }; ExportMode export_mode = AUTO; ///< AUTO detects DDA vs DIA; SPECTRUM forces per-precursor; FRAME returns raw 4D frames diff --git a/src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResampler.h b/src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResampler.h deleted file mode 100644 index f09bf6f17d9..00000000000 --- a/src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResampler.h +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin -// SPDX-License-Identifier: BSD-3-Clause -// -// -------------------------------------------------------------------------- -// $Maintainer: Timo Sachsenberg $ -// $Authors: Eva Lange $ -// -------------------------------------------------------------------------- - -#pragma once - -#include -#include -#include -#include - -#include -#include - -namespace OpenMS -{ - /** - @brief Linear Resampling of raw data. - - This class can be used to generate uniform data from non-uniform raw data (e.g. ESI-TOF or MALDI-TOF experiments). - Therefore the intensity at every position x in the input raw data is spread to the two - adjacent resampling points. - This method preserves the area of the input signal and also the centroid position of a peak. - Therefore it is recommended for quantitation as well as for ProteinIdentification experiments. - - @note Use this method only for high resolution data (< 0.1 Th between two adjacent raw data points). - The resampling rate should be >= the precision. - - @htmlinclude OpenMS_LinearResampler.parameters - */ - class OPENMS_DLLAPI LinearResampler : - public DefaultParamHandler, - public ProgressLogger - { - -public: - - /// Constructor - LinearResampler() : - DefaultParamHandler("LinearResampler") - { - defaults_.setValue("spacing", 0.05, "Spacing of the resampled output peaks."); - defaultsToParam_(); - } - - /// Destructor. - ~LinearResampler() override - { - } - - /** - @brief Applies the resampling algorithm to an MSSpectrum, without alignment between spectra. - */ - void raster(MSSpectrum& spectrum) const - { - //return if nothing to do - if (spectrum.empty()) return; - - typename MSSpectrum::iterator first = spectrum.begin(); - typename MSSpectrum::iterator last = spectrum.end(); - - double end_pos = (last - 1)->getMZ(); - double start_pos = first->getMZ(); - int number_raw_points = static_cast(spectrum.size()); - int number_resampled_points = static_cast(ceil((end_pos - start_pos) / spacing_ + 1)); - - std::vector resampled_peak_container; - resampled_peak_container.resize(number_resampled_points); - - // generate the resampled peaks at positions origin+i*spacing_ - std::vector::iterator it = resampled_peak_container.begin(); - for (int i = 0; i < number_resampled_points; ++i) - { - it->setMZ(start_pos + i * spacing_); - ++it; - } - - // spread the intensity h of the data point at position x to the left and right - // adjacent resampled peaks - double distance_left = 0.; - double distance_right = 0.; - int left_index = 0; - int right_index = 0; - - it = resampled_peak_container.begin(); - for (int i = 0; i < number_raw_points; ++i) - { - int help = static_cast(floor(((first + i)->getMZ() - start_pos) / spacing_)); - left_index = (help < 0) ? 0 : help; - help = number_resampled_points - 1; - right_index = (left_index >= help) ? help : left_index + 1; - - // boundary case: raw point falls on (or is clamped to) the last resampled - // point so left_index == right_index. Both interpolation distances are 0, - // which would lose the entire intensity. Assign it directly instead. - if (left_index == right_index) - { - double intensity = static_cast((it + left_index)->getIntensity()); - intensity += static_cast((first + i)->getIntensity()); - (it + left_index)->setIntensity(intensity); - } - else - { - // compute the distance between x and the left adjacent resampled peak - distance_left = fabs((first + i)->getMZ() - (it + left_index)->getMZ()) / spacing_; - // compute the distance between x and the right adjacent resampled peak - distance_right = fabs((first + i)->getMZ() - (it + right_index)->getMZ()); - - // add the distance_right*h to the left resampled peak and distance_left*h to the right resampled peak - double intensity = static_cast((it + left_index)->getIntensity()); - intensity += static_cast((first + i)->getIntensity()) * distance_right / spacing_; - (it + left_index)->setIntensity(intensity); - intensity = static_cast((it + right_index)->getIntensity()); - intensity += static_cast((first + i)->getIntensity()) * distance_left; - (it + right_index)->setIntensity(intensity); - } - } - - spectrum.swap(resampled_peak_container); - } - - /** - @brief Resamples the data in an MSExperiment, without alignment between spectra. - */ - void rasterExperiment(PeakMap& exp) - { - startProgress(0, exp.size(), "resampling of data"); - for (Size i = 0; i < exp.size(); ++i) - { - raster(exp[i]); - setProgress(i); - } - endProgress(); - } - -protected: - - /// Spacing of the resampled data - double spacing_; - - void updateMembers_() override - { - spacing_ = param_.getValue("spacing"); - } - - }; - - -} // namespace OpenMS - diff --git a/src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResamplerAlign.h b/src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResamplerAlign.h index 82cb2b224a6..052cee86c1a 100644 --- a/src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResamplerAlign.h +++ b/src/openms/include/OpenMS/PROCESSING/RESAMPLING/LinearResamplerAlign.h @@ -3,13 +3,18 @@ // // -------------------------------------------------------------------------- // $Maintainer: Hannes Roest $ -// $Authors: Hannes Roest $ +// $Authors: Hannes Roest, Luis Jacob Keller, Alen Saric$ // -------------------------------------------------------------------------- #pragma once -#include +#include +#include #include +#include +#include +#include +#include namespace OpenMS { @@ -21,20 +26,24 @@ namespace OpenMS Therefore the intensity at every position x in the input raw data is spread to the two adjacent resampling points. This method preserves the area of the input signal and also the centroid position of a peak. - Therefore it is recommended for quantitation as well as for ProteinIdentification experiments. - In addition to the LinearResampler, this class also allows to fix the + This class also allows to fix the points at which resampling will occur. This is useful if the resampling points are known in advance, e.g. if one needs to resample a chromatogram at the positions of another chromatogram. + Therefore it is recommended for quantitation as well as for ProteinIdentification experiments. + + @warning If the configured @p spacing is smaller than the minimal distance between two adjacent + input data points, intensity redistribution may produce spurious peaks. This can occur when + the sampling rate is finer than the detector's dead time. */ class OPENMS_DLLAPI LinearResamplerAlign : - public LinearResampler + public DefaultParamHandler, + public ProgressLogger { -public: - - LinearResamplerAlign() + public: + LinearResamplerAlign() : DefaultParamHandler("LinearResamplerAlign") { defaults_.setValue("spacing", 0.05, "Spacing of the resampled output peaks."); defaults_.setValue("ppm", "false", "Whether spacing is in ppm or Th"); @@ -73,7 +82,6 @@ namespace OpenMS /** @brief Applies the resampling algorithm to a container (MSSpectrum or MSChromatogram) with fixed coordinates. - The container will be resampled at equally spaced points between the supplied start and end positions. The resampling frequency can be controlled by the "spacing" parameter. @@ -102,6 +110,8 @@ namespace OpenMS auto first = container.begin(); auto last = container.end(); + verifySpacing_(first, last, [](auto x){return x->getPos();}); + // get the iterators just before / after the two points start_pos / end_pos while (first != container.end() && (first)->getPos() < start_pos) {++first;} while (last != first && (last - 1)->getPos() > end_pos) {--last;} @@ -143,6 +153,8 @@ namespace OpenMS OPENMS_PRECONDITION(resampled_begin != resampled_end, "Output iterators cannot be identical") // as we use +1 // OPENMS_PRECONDITION(raw_it != raw_end, "Input iterators cannot be identical") + verifySpacing_(raw_it, raw_end, [](auto x){return x->getPos();}); + PeakTypeIterator resample_start = resampled_begin; // need to get the raw iterator between two resampled iterators of the raw data @@ -220,6 +232,8 @@ namespace OpenMS "Raw m/z and intensity iterators need to cover the same distance") // OPENMS_PRECONDITION(raw_it != raw_end, "Input iterators cannot be identical") + verifySpacing_(mz_raw_it, mz_raw_end, [](auto x){return *x;}); + PeakTypeIterator mz_resample_start = mz_resample_it; // need to get the raw iterator between two resampled iterators of the raw data @@ -287,6 +301,8 @@ namespace OpenMS // OPENMS_PRECONDITION(resampled_start != resampled_end, "Output iterators cannot be identical") OPENMS_PRECONDITION(raw_it != raw_end, "Input iterators cannot be identical") // as we use +1 + verifySpacing_(raw_it, raw_end, [](PeakTypeIterator x){return x->getPos();}); + PeakTypeIterator raw_start = raw_it; // need to get the resampled iterator between two iterators of the raw data @@ -309,9 +325,31 @@ namespace OpenMS } + /** + @brief Applies the resampling algorithm to all spectra of an MSExperiment. + + Chromatograms stored in @p exp are not resampled. + + @param[in,out] exp The experiment whose spectra will be resampled in place. + */ + void rasterExperiment(PeakMap& exp) + { + startProgress(0, exp.size(), "resampling of data"); + for (Size i = 0; i < exp.size(); ++i) + { + raster(exp[i]); + setProgress(i); + } + endProgress(); + } + + protected: /// Spacing of the resampled data + double spacing_; + + /// Whether spacing_ is interpreted in ppm (true) or Th (false) bool ppm_; void updateMembers_() override @@ -352,8 +390,33 @@ namespace OpenMS } } } - }; -} + /// helper function to warn the user when the resampling rate is too high + template + void verifySpacing_(PeakTypeIterator it, PeakTypeIterator end, auto access) + { + // ppm_ spacing is relative (parts-per-million) and cannot be compared + // directly against the absolute neighbour distance computed below. + if (ppm_) return; + if (it == end || std::next(it) == end) return; + double min_dist = std::numeric_limits::infinity(); + double current_dist{}; + + while (std::next(it) != end) + { + current_dist = (access(std::next(it)) - access(it)); + if (min_dist > current_dist) min_dist = current_dist; + ++it; + } + + if (spacing_ < min_dist) + { + OPENMS_LOG_WARN << "Resampling spacing (" << spacing_ + << ") is smaller than the smallest distance between data points (" + << min_dist << "). This approximates the detector dead time and may produce spurious peaks.\n"; + } + } + }; +} diff --git a/src/openms/include/OpenMS/PROCESSING/RESAMPLING/sources.cmake b/src/openms/include/OpenMS/PROCESSING/RESAMPLING/sources.cmake index f6648e1bea9..600b7b27d0e 100644 --- a/src/openms/include/OpenMS/PROCESSING/RESAMPLING/sources.cmake +++ b/src/openms/include/OpenMS/PROCESSING/RESAMPLING/sources.cmake @@ -3,7 +3,6 @@ set(directory include/OpenMS/PROCESSING/RESAMPLING) ### list all header files of the directory here set(sources_list_h -LinearResampler.h LinearResamplerAlign.h ) diff --git a/src/openms/source/ANALYSIS/ID/ProSEAlgorithm.cpp b/src/openms/source/ANALYSIS/ID/ProSEAlgorithm.cpp index e4177f9af89..7fad9dca3a5 100644 --- a/src/openms/source/ANALYSIS/ID/ProSEAlgorithm.cpp +++ b/src/openms/source/ANALYSIS/ID/ProSEAlgorithm.cpp @@ -270,6 +270,13 @@ namespace OpenMS // Open search mode is automatically determined based on precursor tolerance in isOpenSearchMode_() + add_a_ions_ = param_.getValue("ions:add_a_ions").toBool(); + add_b_ions_ = param_.getValue("ions:add_b_ions").toBool(); + add_c_ions_ = param_.getValue("ions:add_c_ions").toBool(); + add_x_ions_ = param_.getValue("ions:add_x_ions").toBool(); + add_y_ions_ = param_.getValue("ions:add_y_ions").toBool(); + add_z_ions_ = param_.getValue("ions:add_z_ions").toBool(); + calibration_enabled_ = param_.getValue("calibration:enabled") == "true"; calibration_subset_ratio_ = param_.getValue("calibration:subset_ratio"); calibration_min_psms_ = param_.getValue("calibration:min_psms"); @@ -824,11 +831,19 @@ namespace OpenMS // call sites below read from this field instead of re-computing. last_mod_match_tolerance_used_ = computeModMatchTolerance_(); - // create spectrum generator + // create spectrum generator — forward the ion-series toggles so scoring + // uses the same ion types as the FragmentIndex (which already reads them + // via setParameters in prepareContext). TheoreticalSpectrumGenerator spectrum_generator; Param param(spectrum_generator.getParameters()); param.setValue("add_first_prefix_ion", "true"); param.setValue("add_metainfo", "true"); + param.setValue("add_a_ions", add_a_ions_ ? "true" : "false"); + param.setValue("add_b_ions", add_b_ions_ ? "true" : "false"); + param.setValue("add_c_ions", add_c_ions_ ? "true" : "false"); + param.setValue("add_x_ions", add_x_ions_ ? "true" : "false"); + param.setValue("add_y_ions", add_y_ions_ ? "true" : "false"); + param.setValue("add_z_ions", add_z_ions_ ? "true" : "false"); spectrum_generator.setParameters(param); // preallocate storage for PSMs @@ -1660,12 +1675,16 @@ namespace OpenMS if (!result.extreme_bias) { - // Map signed window [shift - spread, shift + spread] to positive magnitudes: - // -cal_lower = shift - spread -> cal_lower = spread - shift - // +cal_upper = shift + spread -> cal_upper = spread + shift + // Map signed window [shift - spread, shift + spread] to the positive-magnitude + // (lower, upper) schema. Per the convention established above (lines ~1615-1617), + // a (lower, upper) pair means theoretical ∈ [observed - lower, observed + upper], + // equivalently the signed error e = observed - theoretical lies in [-upper, +lower]. + // Matching [shift - spread, shift + spread] against [-upper, +lower] gives: + // -cal_upper = shift - spread -> cal_upper = spread - shift + // +cal_lower = shift + spread -> cal_lower = spread + shift // Both are non-negative when |shift| < spread. - const double cal_lower_raw = result.precursor_spread - prec_shift; - const double cal_upper_raw = result.precursor_spread + prec_shift; + const double cal_lower_raw = result.precursor_spread + prec_shift; + const double cal_upper_raw = result.precursor_spread - prec_shift; // Only tighten - cap against user-configured bounds. result.cal_lower = std::min(cal_lower_raw, precursor_mass_tolerance_lower_); result.cal_upper = std::min(cal_upper_raw, precursor_mass_tolerance_upper_); diff --git a/src/openms/source/CHEMISTRY/DigestionEnzyme.cpp b/src/openms/source/CHEMISTRY/DigestionEnzyme.cpp index 1ea653c265a..25ac12af45d 100644 --- a/src/openms/source/CHEMISTRY/DigestionEnzyme.cpp +++ b/src/openms/source/CHEMISTRY/DigestionEnzyme.cpp @@ -3,7 +3,7 @@ // // -------------------------------------------------------------------------- // $Maintainer: Xiao Liang $ -// $Authors: Xiao Liang $ +// $Authors: Xiao Liang, Alen Saric $ // -------------------------------------------------------------------------- // @@ -36,58 +36,6 @@ namespace OpenMS { } - DigestionEnzyme::DigestionEnzyme(const String& name, - String cut_before, - const String& nocut_after, - String sense, - const std::set& synonyms, - String regex_description) : - name_(name), - synonyms_(synonyms), - regex_description_(std::move(regex_description)) - { - //TODO check if all letters are A-Z? - if (cut_before.empty()) - { - //Maybe assertion? - throw Exception::MissingInformation( - __FILE__, - __LINE__, - OPENMS_PRETTY_FUNCTION, - "No cleavage position given when trying to construct a DigestionEnzyme."); - } - else if (!cut_before.hasSuffix("X")) - { - //TODO think about this - cut_before = cut_before + "X"; - } - cleavage_regex_ = ""; - if (sense.toLower() == "c") - { - cleavage_regex_ += "(?<=[" + cut_before + "]"; - if (!nocut_after.empty()) - { - cleavage_regex_ += "(?!" + nocut_after + "])"; - } - } - else if (sense.toLower() == "n") - { - if (!nocut_after.empty()) - { - cleavage_regex_ += "(?& synonyms, + String regex_description): + DigestionEnzyme(name, buildRegex_(cut_before, nocut_after, sense), synonyms, std::move(regex_description)) + { + } DigestionEnzymeProtein::~DigestionEnzymeProtein() = default; @@ -210,6 +219,67 @@ namespace OpenMS return false; } + String DigestionEnzymeProtein::buildRegex_(String& cut_before, const String& nocut_after, const DigestionEnzymeProtein::Sense& sense) + { + if (cut_before.empty()) + { + throw Exception::MissingInformation( + __FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + "No cleavage position given when trying to construct a DigestionEnzyme."); + } + + for(char c : cut_before) + { + if (c > 'Z' || c < 'A') + { + throw Exception::InvalidParameter( + __FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + "Amino Acids for cleavage contain unknown character: " + String(c)); + } + } + + for(char c : nocut_after) + { + if (c > 'Z' || c < 'A') + { + throw Exception::InvalidParameter( + __FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + "Amino Acids to stop cleavage contain unknown character: " + String(c)); + } + } + + if (!cut_before.hasSuffix("X")) + { + cut_before += "X"; + } + + String result = ""; + if (sense == DigestionEnzymeProtein::Sense::C_TERM) + { + result = "(?<=[" + cut_before + "])"; + if (!nocut_after.empty()) + { + result += "(?!" + nocut_after + "])"; + } + } + else if (sense == DigestionEnzymeProtein::Sense::N_TERM) + { + if (!nocut_after.empty()) + { + result = "(?(enzyme) << " " @@ -218,4 +288,3 @@ namespace OpenMS } } - diff --git a/src/openms/source/FORMAT/BrukerTimsFile.cpp b/src/openms/source/FORMAT/BrukerTimsFile.cpp index ede0e2a84f6..05624d5d052 100644 --- a/src/openms/source/FORMAT/BrukerTimsFile.cpp +++ b/src/openms/source/FORMAT/BrukerTimsFile.cpp @@ -31,6 +31,7 @@ #include #include #include +#include namespace OpenMS { @@ -148,7 +149,12 @@ namespace OpenMS float im; }; - static constexpr size_t MAX_CENTROID_PEAKS = 10000; + // Default cap on the number of centroided peaks per frame. Overridable via + // BrukerTimsFile::Config::ms1_centroid_max_peaks. For per-frame MS1 (~200k + // peaks) the 10k default covers the top-intensity peaks well; for aggregated + // MS1 where the frame-stacked grid can exceed 600k peaks, users should raise + // this (100k is the new CLI default). + static constexpr size_t DEFAULT_CENTROID_MAX_PEAKS = 10000; // Reusable buffer for IM-dimension centroiding of a single frame. // Translated from Sage's PeakBuffer (Lazear 2023, doi:10.1021/acs.jproteome.3c00486). @@ -159,6 +165,12 @@ namespace OpenMS std::vector order; // indices sorted by descending intensity std::vector agg_buff; // centroided output + // Cap-hit tracking across frames within one load() call. Emitted as a + // single summary line at the end of the load via reportCapSummary(). + size_t cap_hits_ = 0; + size_t total_calls_ = 0; + float max_dropped_intensity_ = 0.0f; + void clear() { peaks.clear(); @@ -196,17 +208,46 @@ namespace OpenMS }); } + // Overload: accepts double* intensities (from FrameAggregator::finalize() output, + // where summed intensities across aggregated frames can exceed uint32_t range). + void loadFrame(const double* mz_values, const double* intensities, + const double* im_values, uint32_t count) + { + clear(); + peaks.reserve(count); + for (uint32_t i = 0; i < count; ++i) + { + peaks.push_back({static_cast(mz_values[i]), + static_cast(intensities[i]), + static_cast(im_values[i])}); + } + + std::sort(peaks.begin(), peaks.end(), + [](const ImsPeak& a, const ImsPeak& b) { return a.mz < b.mz; }); + + order.resize(count); + std::iota(order.begin(), order.end(), size_t(0)); + std::sort(order.begin(), order.end(), + [this](size_t a, size_t b) + { + return peaks[b].intensity < peaks[a].intensity; + }); + } + // Centroid the loaded frame by collapsing the IM dimension. // Iterates peaks in descending intensity order. For each apex peak, aggregates // all unconsumed neighbors within m/z (ppm) and IM (percent) tolerances. - // Output is capped at MAX_CENTROID_PEAKS entries. + // Output is capped at max_peaks entries (dropped peaks logged at WARN level + // when high-intensity, at DEBUG otherwise). void centroid(float mz_ppm, float im_pct, std::vector& out_mz, std::vector& out_intensity, - std::vector& out_im) + std::vector& out_im, + size_t max_peaks = DEFAULT_CENTROID_MAX_PEAKS) { agg_buff.clear(); - agg_buff.reserve(std::min(peaks.size(), MAX_CENTROID_PEAKS + 1)); + agg_buff.reserve(std::min(peaks.size(), max_peaks + 1)); + ++total_calls_; const float utol = mz_ppm / 1e6f; const float im_tol_frac = im_pct / 100.0f; @@ -216,13 +257,16 @@ namespace OpenMS { if (peaks[idx].intensity <= 0.0f) continue; // already consumed - if (agg_buff.size() > MAX_CENTROID_PEAKS) + if (agg_buff.size() > max_peaks) { + // Only count as a "real" cap hit if the next dropped peak is above + // the noise floor (~200 counts on timsTOF). Dropping noise-floor + // peaks is the expected behavior of the cap and not worth reporting. if (peaks[idx].intensity > 200.0f) { - OPENMS_LOG_DEBUG << "FrameCentroider: reached MAX_CENTROID_PEAKS at index " - << idx << "/" << peaks.size() - << " intensity=" << peaks[idx].intensity << std::endl; + ++cap_hits_; + if (peaks[idx].intensity > max_dropped_intensity_) + max_dropped_intensity_ = peaks[idx].intensity; } break; } @@ -278,6 +322,25 @@ namespace OpenMS out_im[i] = agg_buff[i].im; } } + + // Emit one summary WARN if the cap fired on any frames during this load() + // call. Called by each top-level load path (loadDIA_, loadDDA_, + // loadDIAStreaming, loadFrames_) after all centroiding is done. + void reportCapSummary(size_t max_peaks) + { + if (cap_hits_ == 0) return; + OPENMS_LOG_WARN << "Warning: MS1 centroiding hit ms1_centroid_max_peaks (=" + << max_peaks << ") on " << cap_hits_ << " of " + << total_calls_ << " spectra; highest dropped-peak " + << "intensity was " << max_dropped_intensity_ + << ". Consider raising -bruker:ms1_centroid_max_peaks " + << "to retain more peaks." << std::endl; + // Reset so successive load() calls on the same instance don't carry + // state forward. + cap_hits_ = 0; + total_calls_ = 0; + max_dropped_intensity_ = 0.0f; + } }; // Helper: check if MS1 centroiding is enabled, warn on partial config @@ -294,6 +357,38 @@ namespace OpenMS return has_mz && has_im; } + // Emit one-shot partial-config warnings for the MS1 aggregation knobs. + // Called once per top-level load path, after ms1_frame_ids is populated. + static void warnPartialMS1AggregationConfig(const BrukerTimsFile::Config& config, + size_t ms1_frame_count) + { + if (config.ms1_min_support > 0 && config.ms1_n_neighbors == 0) + { + OPENMS_LOG_WARN << "Warning: ms1_min_support (=" << config.ms1_min_support + << ") ignored: ms1_n_neighbors=0 disables aggregation." << std::endl; + } + if (config.ms1_max_rt_distance_sec > 0.0 && config.ms1_n_neighbors == 0) + { + OPENMS_LOG_WARN << "Warning: ms1_max_rt_distance_sec (=" + << config.ms1_max_rt_distance_sec + << ") ignored: ms1_n_neighbors=0 disables aggregation." << std::endl; + } + if (config.ms1_n_neighbors > 0 && ms1_frame_count > 0 && + static_cast(2 * config.ms1_n_neighbors + 1) > ms1_frame_count) + { + OPENMS_LOG_WARN << "Warning: ms1_n_neighbors (=" << config.ms1_n_neighbors + << ") exceeds half the run's MS1 frame count (=" + << ms1_frame_count << "); effective window will be clamped " + << "at run edges." << std::endl; + } + if (!config.load_ms1 && config.ms1_n_neighbors > 0) + { + OPENMS_LOG_WARN << "Warning: ms1_n_neighbors (=" << config.ms1_n_neighbors + << ") ignored: load_ms1=false disables MS1 loading entirely." + << std::endl; + } + } + // ===================================================================== // TdfMetadataReader: private helper functions for SQL-based metadata // ===================================================================== @@ -330,13 +425,14 @@ namespace OpenMS return false; } - /// Helper for DIA MS2 frame aggregation and denoising. - /// Bins peaks from multiple frames onto a sparse (mz_bin, scan_id) grid, - /// applies spatial denoising, and outputs the surviving peaks. - class DIAFrameAggregator + /// Grid-based aggregator for stacking peaks from adjacent frames into a sparse + /// (mz_bin, scan_id) grid, with optional 3x3 spatial denoising. Used by both + /// DIA MS2 aggregation and MS1 aggregation (with different bin widths). + class FrameAggregator { public: - static constexpr double MZ_BIN_WIDTH = 0.02; // Da — absorbs frame-to-frame m/z jitter + explicit FrameAggregator(double mz_bin_width = 0.02) + : mz_bin_width_(mz_bin_width) {} struct OutputPeak { @@ -345,10 +441,22 @@ namespace OpenMS uint32_t scan_id; // native scan index (for IM conversion) }; + /// Returns the grid key of the neighbor cell at signed (dm, ds) offset. + /// Returns std::nullopt when the offset would wrap past 0 on either axis. + static inline std::optional + neighborKey(uint32_t mz_bin, uint32_t scan_id, int dm, int ds) + { + if (dm < 0 && mz_bin < static_cast(-dm)) return std::nullopt; + if (ds < 0 && scan_id < static_cast(-ds)) return std::nullopt; + uint32_t new_mz = mz_bin + static_cast(dm); + uint32_t new_scan = scan_id + static_cast(ds); + return (static_cast(new_mz) << 32) | new_scan; + } + /// Add a peak to the grid. Call for every peak from every neighbor frame. void addPeak(double mz, uint32_t intensity, uint32_t scan_id) { - int64_t mz_bin = static_cast(std::round(mz / MZ_BIN_WIDTH)); + int64_t mz_bin = static_cast(std::round(mz / mz_bin_width_)); uint64_t key = (static_cast(static_cast(mz_bin)) << 32) | scan_id; auto& cell = grid_[key]; @@ -379,9 +487,10 @@ namespace OpenMS for (int ds = -1; ds <= 1; ++ds) { if (dm == 0 && ds == 0) continue; - uint64_t nkey = (static_cast(static_cast(mz_bin + dm)) << 32) - | (scan_id + ds); - if (grid_.count(nkey)) ++neighbors; + if (auto nkey = neighborKey(mz_bin, scan_id, dm, ds)) + { + if (grid_.count(*nkey)) ++neighbors; + } } } @@ -419,9 +528,10 @@ namespace OpenMS for (int ds = -1; ds <= 1; ++ds) { if (dm == 0 && ds == 0) continue; - uint64_t nkey = (static_cast(static_cast(mz_bin + dm)) << 32) - | (scan_id + ds); - if (grid_.count(nkey)) ++neighbors; + if (auto nkey = neighborKey(mz_bin, scan_id, dm, ds)) + { + if (grid_.count(*nkey)) ++neighbors; + } } } if (neighbors < min_support) continue; @@ -448,12 +558,13 @@ namespace OpenMS { for (int ds = -2; ds <= 2; ++ds) { - uint64_t nkey = (static_cast(static_cast(mz_bin + dm)) << 32) - | (scan_id + ds); - auto it = denoised.find(nkey); - if (it != denoised.end()) + if (auto nkey = neighborKey(mz_bin, scan_id, dm, ds)) { - weighted_sum += it->second.intensity_sum * kernel[dm + 2][ds + 2]; + auto it = denoised.find(*nkey); + if (it != denoised.end()) + { + weighted_sum += it->second.intensity_sum * kernel[dm + 2][ds + 2]; + } } } } @@ -472,10 +583,11 @@ namespace OpenMS for (int ds = -1; ds <= 1 && is_max; ++ds) { if (dm == 0 && ds == 0) continue; - uint64_t nkey = (static_cast(static_cast(mz_bin + dm)) << 32) - | (scan_id + ds); - auto it = smoothed.find(nkey); - if (it != smoothed.end() && it->second > val) is_max = false; + if (auto nkey = neighborKey(mz_bin, scan_id, dm, ds)) + { + auto it = smoothed.find(*nkey); + if (it != smoothed.end() && it->second > val) is_max = false; + } } } if (is_max) maxima.push_back(key); @@ -497,16 +609,17 @@ namespace OpenMS { for (int ds = -2; ds <= 2; ++ds) { - uint64_t nkey = (static_cast(static_cast(center_mz_bin + dm)) << 32) - | (center_scan + ds); - auto it = denoised.find(nkey); - if (it != denoised.end()) + if (auto nkey = neighborKey(center_mz_bin, center_scan, dm, ds)) { - double int_val = it->second.intensity_sum; - double mz_val = it->second.mz_weighted_sum / it->second.intensity_sum; - total_intensity += int_val; - mz_weighted += mz_val * int_val; - scan_weighted += static_cast(static_cast(nkey & 0xFFFFFFFF)) * int_val; + auto it = denoised.find(*nkey); + if (it != denoised.end()) + { + double int_val = it->second.intensity_sum; + double mz_val = it->second.mz_weighted_sum / it->second.intensity_sum; + total_intensity += int_val; + mz_weighted += mz_val * int_val; + scan_weighted += static_cast(static_cast(*nkey & 0xFFFFFFFF)) * int_val; + } } } } @@ -524,7 +637,13 @@ namespace OpenMS /// Clear grid for reuse void clear() { grid_.clear(); } + /// Pre-reserve bucket space for N cells. Call once before a hot loop + /// to avoid rehash cascades. Safe to over-allocate — clear() preserves + /// the bucket array across center frames. + void reserve(size_t n) { grid_.reserve(n); } + private: + double mz_bin_width_; struct Cell { double intensity_sum = 0.0; @@ -911,6 +1030,9 @@ namespace OpenMS spec.setRT(frame.time); spec.setMSLevel(ms_level); spec.setDriftTimeUnit(DriftTimeUnit::VSSC); + // TDF peaks are detector-centroided in m/z (peak-list format); label as + // centroid so downstream tools (e.g. CometAdapter) don't reject as profile. + spec.setType(SpectrumSettings::SpectrumType::CENTROID); spec.setNativeID("frame=" + String(frame.id)); if (frame.num_peaks == 0) return; @@ -976,7 +1098,8 @@ namespace OpenMS std::vector cent_mz, cent_intensity; std::vector cent_im; centroider.centroid(config.ms1_centroid_mz_ppm, config.ms1_centroid_im_pct, - cent_mz, cent_intensity, cent_im); + cent_mz, cent_intensity, cent_im, + static_cast(config.ms1_centroid_max_peaks)); // Build MSSpectrum from centroided output spec.reserve(cent_mz.size()); @@ -996,6 +1119,154 @@ namespace OpenMS spec.setIMPeakType(IMPeakType::IM_CENTROIDED); } + // ===================================================================== + // Helper: build one MS1 spectrum from a frame, with optional IM centroiding + // ===================================================================== + static void loadMS1Spectrum(TimsFrame& frame, MSSpectrum& spec, + const BrukerTimsFile::Config& config, + FrameCentroider& centroider) + { + if (isCentroidingEnabled(config)) + centroidMS1Frame(frame, spec, config, centroider); + else + frameToSpectrum(frame, spec, 1); + } + + // ===================================================================== + // Helper: build one aggregated MS1 spectrum from a sliding window of + // adjacent MS1 frames. Composes with within-frame IM centroiding + // (FrameCentroider) when ms1_centroid_mz_ppm/pct are set. + // + // Pre: center_idx is the output-emitting frame; N and RT cap come from + // config. Post: spec is populated with center-frame metadata and + // per-peak IM data. IMPeakType is IM_PROFILE or IM_CENTROIDED based on + // isCentroidingEnabled(config). + // ===================================================================== + static void loadAggregatedMS1Spectrum( + TimsDataHandle& handle, + const std::vector& ms1_frame_ids, + size_t center_idx, + const BrukerTimsFile::Config& config, + FrameAggregator& aggregator, + FrameCentroider& centroider, + MSSpectrum& spec) + { + const int N = config.ms1_n_neighbors; + const size_t size = ms1_frame_ids.size(); + const size_t lo = (static_cast(N) > center_idx) ? 0 : center_idx - N; + const size_t hi = std::min(center_idx + static_cast(N), size - 1); + + TimsFrame& center_frame = handle.get_frame(ms1_frame_ids[center_idx]); + + aggregator.clear(); + size_t contributing_frames = 0; + + for (size_t ni = lo; ni <= hi; ++ni) + { + TimsFrame& nframe = handle.get_frame(ms1_frame_ids[ni]); + + // Center is always included regardless of RT cap; neighbors are capped. + const bool is_center = (ni == center_idx); + if (!is_center && config.ms1_max_rt_distance_sec > 0.0 && + std::abs(nframe.time - center_frame.time) > config.ms1_max_rt_distance_sec) + { + continue; + } + + if (nframe.num_peaks == 0) continue; + + std::vector scan_ids(nframe.num_peaks); + std::vector intensities(nframe.num_peaks); + std::vector mzs(nframe.num_peaks); + + nframe.save_to_buffs(nullptr, scan_ids.data(), nullptr, intensities.data(), + mzs.data(), nullptr, nullptr); + + bool frame_contributed = false; + for (uint32_t p = 0; p < nframe.num_peaks; ++p) + { + aggregator.addPeak(mzs[p], intensities[p], scan_ids[p]); + frame_contributed = true; + } + if (frame_contributed) ++contributing_frames; + } + + const bool skip_denoise = (contributing_frames <= 1) || (config.ms1_min_support <= 0); + auto peaks = aggregator.finalize(config.ms1_min_support, skip_denoise); + + // Populate center-frame metadata regardless of whether peaks is empty — + // preserves the per-center-frame cadence invariant for streaming consumers. + spec.clear(true); + spec.setRT(center_frame.time); + spec.setMSLevel(1); + spec.setDriftTimeUnit(DriftTimeUnit::VSSC); + spec.setNativeID("frame=" + String(center_frame.id)); + + DataArrays::FloatDataArray im_array; + IMDataConverter::setIMUnit(im_array, DriftTimeUnit::VSSC); + + if (peaks.empty()) + { + spec.getFloatDataArrays().push_back(std::move(im_array)); + spec.setIMPeakType(IMPeakType::IM_PROFILE); + return; + } + + if (isCentroidingEnabled(config)) + { + // Unpack aggregator output into parallel arrays for FrameCentroider. + std::vector cent_mzs(peaks.size()); + std::vector cent_intensities(peaks.size()); + std::vector cent_ims(peaks.size()); + for (size_t i = 0; i < peaks.size(); ++i) + { + cent_mzs[i] = peaks[i].mz; + cent_intensities[i] = peaks[i].intensity; + double im_val = 0.0; + handle.scan2inv_ion_mobility_converter->convert( + center_frame.id, &im_val, &peaks[i].scan_id, 1); + cent_ims[i] = im_val; + } + + centroider.loadFrame(cent_mzs.data(), cent_intensities.data(), + cent_ims.data(), static_cast(peaks.size())); + + std::vector out_mz, out_intensity; + std::vector out_im; + centroider.centroid(config.ms1_centroid_mz_ppm, config.ms1_centroid_im_pct, + out_mz, out_intensity, out_im, + static_cast(config.ms1_centroid_max_peaks)); + + spec.reserve(out_mz.size()); + for (size_t i = 0; i < out_mz.size(); ++i) + { + Peak1D peak; + peak.setMZ(out_mz[i]); + peak.setIntensity(static_cast(out_intensity[i])); + spec.push_back(peak); + } + im_array.assign(out_im.begin(), out_im.end()); + spec.setType(SpectrumSettings::SpectrumType::CENTROID); + spec.setIMPeakType(IMPeakType::IM_CENTROIDED); + } + else + { + spec.reserve(peaks.size()); + im_array.reserve(peaks.size()); + for (const auto& peak : peaks) + { + spec.emplace_back(peak.mz, static_cast(peak.intensity)); + double im_val = 0.0; + handle.scan2inv_ion_mobility_converter->convert( + center_frame.id, &im_val, &peak.scan_id, 1); + im_array.push_back(static_cast(im_val)); + } + spec.setIMPeakType(IMPeakType::IM_PROFILE); + } + + spec.getFloatDataArrays().push_back(std::move(im_array)); + } + // ===================================================================== // isDIA_: detect DDA vs DIA by querying for SWATH window tables // ===================================================================== @@ -1111,8 +1382,8 @@ namespace OpenMS SQLite::Database db(std::string(tdf_path), SQLite::OPEN_READONLY); // --- MS1 frames --- - bool do_centroid = isCentroidingEnabled(config); FrameCentroider centroider; + FrameAggregator ms1_aggregator(0.01); std::vector ms1_frame_ids; for (uint32_t fid = handle->min_frame_id(); fid <= handle->max_frame_id(); ++fid) @@ -1120,16 +1391,24 @@ namespace OpenMS if (handle->has_frame(fid) && handle->get_frame(fid).msms_type == 0) ms1_frame_ids.push_back(fid); } + warnPartialMS1AggregationConfig(config, ms1_frame_ids.size()); + if (config.ms1_n_neighbors > 0) ms1_aggregator.reserve(300'000); - startProgress(0, ms1_frame_ids.size(), "Streaming DIA-PASEF MS1 frames"); - for (size_t i = 0; i < ms1_frame_ids.size(); ++i) + const size_t ms1_to_emit = config.load_ms1 ? ms1_frame_ids.size() : 0; + startProgress(0, ms1_to_emit, "Streaming DIA-PASEF MS1 frames"); + for (size_t i = 0; i < ms1_to_emit; ++i) { - TimsFrame& frame = handle->get_frame(ms1_frame_ids[i]); MSSpectrum spec; - if (do_centroid) - centroidMS1Frame(frame, spec, config, centroider); + if (config.ms1_n_neighbors > 0) + { + loadAggregatedMS1Spectrum(*handle, ms1_frame_ids, i, config, + ms1_aggregator, centroider, spec); + } else - frameToSpectrum(frame, spec, 1); + { + TimsFrame& frame = handle->get_frame(ms1_frame_ids[i]); + loadMS1Spectrum(frame, spec, config, centroider); + } // Sort peaks by m/z (and the associated IM float data array alongside). // TIMS save_to_buffs returns peaks in (scan_id, m/z-within-scan) order, // which is NOT globally m/z-sorted. Downstream consumers @@ -1142,6 +1421,7 @@ namespace OpenMS setProgress(i); } endProgress(); + centroider.reportCapSummary(static_cast(config.ms1_centroid_max_peaks)); // --- MS2 frames: raw per-WindowGroup iteration (no aggregation) --- auto windows = readDIAWindows(db, *handle->scan2inv_ion_mobility_converter); @@ -1194,6 +1474,7 @@ namespace OpenMS spec.setRT(frame.time); spec.setMSLevel(2); spec.setDriftTimeUnit(DriftTimeUnit::VSSC); + spec.setType(SpectrumSettings::SpectrumType::CENTROID); spec.setNativeID("frame=" + String(frame.id) + " windowGroup=" + String(win->window_group) + " scan=" + String(win->scan_begin)); Precursor prec; @@ -1292,24 +1573,40 @@ namespace OpenMS bool is_dia = isDIA(db); Config::ExportMode mode = config.export_mode; - // Compute expected size for consumer + // Compute expected size for consumer. Must mirror what loadFrames_/loadDIA_/ + // loadDDA_ actually emit, which skip MS1 entirely when load_ms1=false. FrameCounts counts = readFrameCounts(db); + const size_t ms1_emitted = config.load_ms1 ? counts.ms1 : 0; size_t expected = 0; if (mode == Config::FRAME) { - expected = counts.total; + expected = ms1_emitted + counts.ms2; } else if (is_dia && mode != Config::SPECTRUM) { - // DIA: MS1 frames + MS2 frames * windows + // DIA: each MS2 frame belongs to exactly ONE WindowGroup and only + // produces spectra for that group's windows. So the emit count is + // Σ_group (frames_in_group × windows_in_group) + // — NOT frames × all_windows. Mirrors the total_work computation in + // loadDIA_. auto windows = readDIAWindows(db, *handle->scan2inv_ion_mobility_converter); - expected = counts.ms1 + static_cast(counts.ms2) * windows.size(); + auto group_to_frames = readFrameToWindowGroupMapping(db, windows); + std::map group_window_count; + for (const auto& w : windows) ++group_window_count[w.window_group]; + size_t dia_ms2_spectra = 0; + for (const auto& [group, frames] : group_to_frames) + { + auto wit = group_window_count.find(group); + if (wit != group_window_count.end()) + dia_ms2_spectra += frames.size() * wit->second; + } + expected = ms1_emitted + dia_ms2_spectra; } else { // DDA: MS1 frames + per-precursor MS2 spectra uint32_t num_precursors = countDDAPrecursors(db); - expected = counts.ms1 + num_precursors; + expected = ms1_emitted + num_precursors; } consumer->setExpectedSize(expected, 0); @@ -1359,6 +1656,7 @@ namespace OpenMS else ms2_frame_ids.push_back(fid); } + warnPartialMS1AggregationConfig(config, ms1_frame_ids.size()); // Read DDA precursors from SQL std::vector precursor_entries = readDDAPrecursors(db); @@ -1371,24 +1669,27 @@ namespace OpenMS } uint32_t num_ms2 = static_cast(precursor_groups.size()); - exp.reserveSpaceSpectra(static_cast(ms1_frame_ids.size()) + num_ms2); + const size_t ms1_to_emit = config.load_ms1 ? ms1_frame_ids.size() : 0; + exp.reserveSpaceSpectra(static_cast(ms1_to_emit) + num_ms2); - startProgress(0, ms1_frame_ids.size() + num_ms2, "Loading DDA-PASEF data"); + startProgress(0, ms1_to_emit + num_ms2, "Loading DDA-PASEF data"); // --- MS1 frames --- - bool do_centroid = isCentroidingEnabled(config); FrameCentroider centroider; - for (size_t i = 0; i < ms1_frame_ids.size(); ++i) + FrameAggregator ms1_aggregator(0.01); + if (config.load_ms1 && config.ms1_n_neighbors > 0) ms1_aggregator.reserve(300'000); + for (size_t i = 0; i < ms1_to_emit; ++i) { - TimsFrame& frame = handle.get_frame(ms1_frame_ids[i]); MSSpectrum spec; - if (do_centroid) + if (config.ms1_n_neighbors > 0) { - centroidMS1Frame(frame, spec, config, centroider); + loadAggregatedMS1Spectrum(handle, ms1_frame_ids, i, config, + ms1_aggregator, centroider, spec); } else { - frameToSpectrum(frame, spec, 1); + TimsFrame& frame = handle.get_frame(ms1_frame_ids[i]); + loadMS1Spectrum(frame, spec, config, centroider); } exp.addSpectrum(std::move(spec)); setProgress(i); @@ -1589,7 +1890,7 @@ namespace OpenMS if (all_tofs.empty()) { ++precursor_idx; - setProgress(ms1_frame_ids.size() + precursor_idx); + setProgress(ms1_to_emit + precursor_idx); continue; } @@ -1669,7 +1970,7 @@ namespace OpenMS if (all_mz.empty()) { ++precursor_idx; - setProgress(ms1_frame_ids.size() + precursor_idx); + setProgress(ms1_to_emit + precursor_idx); continue; } @@ -1698,7 +1999,43 @@ namespace OpenMS spec.setMSLevel(2); spec.setDriftTime(scalar_im); spec.setDriftTimeUnit(DriftTimeUnit::VSSC); - spec.setNativeID("scan=" + String(prec_id)); + spec.setType(SpectrumSettings::SpectrumType::CENTROID); + // Native ID: "frame= scan= precursor=

". + // + // This extends the PSI-MS MS:1002818 "Bruker TDF nativeID format" + // pattern ("frame= scan=") with a trailing + // "precursor=" token. The extension is REQUIRED for + // uniqueness under OpenMS's per-precursor aggregation: + // + // - Unlike pwiz/msconvert (which emits one MS2 spectrum per + // Bruker PASEF mobility-scan row, where (frame, scan_begin) is + // inherently unique), this reader ports timsrust's strategy of + // merging all PasefFrameMsMsInfo entries sharing the same + // Precursors.Id (potentially across multiple frames) into ONE + // aggregated MS2 spectrum — see the loop above and commit + // 41ce6cfeb (PR #8975). first_entry->(frame_id, scan_begin) is + // just the min-ordered row of that group and is NOT guaranteed + // unique across distinct precursors (PASEF can co-isolate + // multiple precursors in the same (Frame, ScanNumBegin) mobility + // window differing only by IsolationMz; multi-cycle precursors + // can also collide on their first entry). + // + // - Strict MS:1002818 validators may flag the trailing token, but + // the ecosystem convention (pwiz combined mode uses + // "merged=N frame=F scanStart=A scanEnd=B" under the SAME + // MS:1002818 accession; our own DIA path emits + // "frame=F windowGroup=G scan=S") is that vendor-specific + // disambiguators are added while keeping MS:1002818. Comet and + // Sage dispatch on nativeID content, not the CV accession, so + // compatibility is preserved in practice. + // + // - Bruker Precursors.Id is also duplicated as a typed MetaValue + // "bruker_precursor_id" for direct programmatic lookup without + // parsing the nativeID string. + spec.setNativeID("frame=" + String(first_entry->frame_id) + + " scan=" + String(first_entry->scan_begin) + + " precursor=" + String(prec_id)); + spec.setMetaValue("bruker_precursor_id", static_cast(prec_id)); // Copy sorted peak data spec.reserve(all_mz.size()); @@ -1733,9 +2070,10 @@ namespace OpenMS exp.addSpectrum(std::move(spec)); ++precursor_idx; - setProgress(ms1_frame_ids.size() + precursor_idx); + setProgress(ms1_to_emit + precursor_idx); } endProgress(); + centroider.reportCapSummary(static_cast(config.ms1_centroid_max_peaks)); } // ===================================================================== @@ -1756,27 +2094,33 @@ namespace OpenMS if (frame.msms_type == 0) ms1_frame_ids.push_back(fid); } + warnPartialMS1AggregationConfig(config, ms1_frame_ids.size()); // --- MS1 frames --- - bool do_centroid = isCentroidingEnabled(config); FrameCentroider centroider; - startProgress(0, ms1_frame_ids.size(), "Loading DIA-PASEF MS1 frames"); - for (size_t i = 0; i < ms1_frame_ids.size(); ++i) + FrameAggregator ms1_aggregator(0.01); // finer bin for MS1 (see design spec) + const size_t ms1_to_emit = config.load_ms1 ? ms1_frame_ids.size() : 0; + if (config.load_ms1 && config.ms1_n_neighbors > 0) ms1_aggregator.reserve(300'000); + startProgress(0, ms1_to_emit, "Loading DIA-PASEF MS1 frames"); + for (size_t i = 0; i < ms1_to_emit; ++i) { - TimsFrame& frame = handle.get_frame(ms1_frame_ids[i]); MSSpectrum spec; - if (do_centroid) + if (config.ms1_n_neighbors > 0) { - centroidMS1Frame(frame, spec, config, centroider); + loadAggregatedMS1Spectrum(handle, ms1_frame_ids, i, config, + ms1_aggregator, centroider, spec); } else { - frameToSpectrum(frame, spec, 1); + TimsFrame& frame = handle.get_frame(ms1_frame_ids[i]); + loadMS1Spectrum(frame, spec, config, centroider); } exp.addSpectrum(std::move(spec)); setProgress(i); } endProgress(); + if (config.load_ms1) + centroider.reportCapSummary(static_cast(config.ms1_centroid_max_peaks)); // --- SWATH windows --- std::vector windows = readDIAWindows(db, *handle.scan2inv_ion_mobility_converter); @@ -1818,7 +2162,7 @@ namespace OpenMS Size progress_count = 0; const int N = config.dia_ms2_n_neighbors; - DIAFrameAggregator aggregator; + FrameAggregator aggregator; for (const auto& [group, frame_ids] : group_to_frames) { @@ -1832,6 +2176,7 @@ namespace OpenMS { setProgress(progress_count++); aggregator.clear(); + size_t contributing_frames = 0; // Determine neighbor range size_t lo = (i >= static_cast(N)) ? i - N : 0; @@ -1850,24 +2195,24 @@ namespace OpenMS nframe.save_to_buffs(nullptr, scan_ids.data(), nullptr, intensities.data(), mzs.data(), nullptr, nullptr); + bool frame_contributed = false; for (uint32_t p = 0; p < nframe.num_peaks; ++p) { // Filter by scan bounds (integer comparison, no IM conversion needed) if (scan_ids[p] >= win->scan_begin && scan_ids[p] <= win->scan_end) { aggregator.addPeak(mzs[p], intensities[p], scan_ids[p]); + frame_contributed = true; } } + if (frame_contributed) ++contributing_frames; } - // Denoise (skip if only 1 frame in range) - // TODO: This checks the index range, not the actual number of neighbor - // frames that contributed peaks to the grid. If neighbor frames exist - // but have zero peaks passing the IM filter for this window, the grid - // contains only single-frame data yet denoising still runs — which may - // remove valid isolated peaks. Consider counting actual contributing - // frames if this becomes an issue in practice. - bool skip_denoise = (hi - lo) < 1; + // Denoise skipped when the grid was populated from a single frame (or none), + // because spatial denoising requires neighbor context that only cross-frame + // aggregation provides. Also skipped when min_support <= 0 (user explicitly + // disabled the filter). + bool skip_denoise = (contributing_frames <= 1) || config.dia_ms2_min_support <= 0; auto peaks = config.dia_ms2_centroid ? aggregator.finalizeCentroided(config.dia_ms2_min_support, skip_denoise) : aggregator.finalize(config.dia_ms2_min_support, skip_denoise); @@ -1880,6 +2225,7 @@ namespace OpenMS spec.setRT(center_frame.time); spec.setMSLevel(2); spec.setDriftTimeUnit(DriftTimeUnit::VSSC); + spec.setType(SpectrumSettings::SpectrumType::CENTROID); spec.setNativeID("frame=" + String(center_frame.id) + " windowGroup=" + String(win->window_group) + " scan=" + String(win->scan_begin)); Precursor prec; @@ -1944,6 +2290,7 @@ namespace OpenMS spec.setRT(frame.time); spec.setMSLevel(2); spec.setDriftTimeUnit(DriftTimeUnit::VSSC); + spec.setType(SpectrumSettings::SpectrumType::CENTROID); spec.setNativeID("frame=" + String(frame.id) + " windowGroup=" + String(win->window_group) + " scan=" + String(win->scan_begin)); Precursor prec; @@ -1988,9 +2335,22 @@ namespace OpenMS // ===================================================================== void BrukerTimsFile::loadFrames_(TimsDataHandle& handle, MSExperiment& exp, const Config& config) { + // FRAME mode returns raw per-frame spectra; MS1 aggregation would break that + // contract, so warn and ignore the knob. + if (config.ms1_n_neighbors > 0) + { + OPENMS_LOG_WARN << "Warning: ms1_n_neighbors (=" << config.ms1_n_neighbors + << ") is ignored when export_mode=FRAME because FRAME mode returns " + << "raw frames. Set export_mode=AUTO or SPECTRUM to enable MS1 " + << "frame aggregation." << std::endl; + } + // Iterate all frames at each MS level for (int level = 1; level <= 2; ++level) { + // Honor load_ms1: skip level==1 entirely when the user opts out of MS1. + if (level == 1 && !config.load_ms1) continue; + // Collect frame IDs at this level std::vector frame_ids; for (uint32_t fid = handle.min_frame_id(); fid <= handle.max_frame_id(); ++fid) @@ -2011,25 +2371,22 @@ namespace OpenMS continue; } - bool do_centroid = (level == 1) && isCentroidingEnabled(config); FrameCentroider centroider; startProgress(0, frame_ids.size(), String("Loading MS") + String(level) + " frames"); for (size_t i = 0; i < frame_ids.size(); ++i) { TimsFrame& frame = handle.get_frame(frame_ids[i]); MSSpectrum spec; - if (do_centroid) - { - centroidMS1Frame(frame, spec, config, centroider); - } + if (level == 1) + loadMS1Spectrum(frame, spec, config, centroider); else - { frameToSpectrum(frame, spec, level); - } exp.addSpectrum(std::move(spec)); setProgress(i); } endProgress(); + if (level == 1) + centroider.reportCapSummary(static_cast(config.ms1_centroid_max_peaks)); } } diff --git a/src/openms/source/FORMAT/PepXMLFile.cpp b/src/openms/source/FORMAT/PepXMLFile.cpp index d2e1da05995..23e7aca82b7 100644 --- a/src/openms/source/FORMAT/PepXMLFile.cpp +++ b/src/openms/source/FORMAT/PepXMLFile.cpp @@ -1254,7 +1254,7 @@ namespace OpenMS value = attributeAsDouble_(attributes, "value"); peptide_hit_.setMetaValue("Comet:lnrSp", value); // name: Comet:lnrSp peptide_hit_.setMetaValue("COMET:lnRankSP", value); // name: COMET:lnRankSP - } + } else if (name == "deltLCn") { value = attributeAsDouble_(attributes, "value"); @@ -1263,7 +1263,7 @@ namespace OpenMS else if (name == "lnExpect") { value = attributeAsDouble_(attributes, "value"); - peptide_hit_.setMetaValue("COMET:lnExpect", value); // name: Comet:lnExpect + peptide_hit_.setMetaValue("COMET:lnExpect", value); // name: Comet:lnExpect } else if (name == "IonFrac") { @@ -1275,7 +1275,7 @@ namespace OpenMS { value = attributeAsDouble_(attributes, "value"); peptide_hit_.setMetaValue("COMET:lnNumSP", value); // name: Comet:lnNumSP - } + } } else if (parse_unknown_scores_) { @@ -1295,7 +1295,7 @@ namespace OpenMS //TODO warn about non-numeric score? Or even do not catch the conversion error? peptide_hit_.setMetaValue(name, attributeAsString_(attributes, "value")); // Any other generic score (fallback String) } - + } } } @@ -1350,7 +1350,7 @@ namespace OpenMS { bool current_prot_is_decoy = protein.hasPrefix(decoy_prefix_); auto current_type = peptide_hit_.getTargetDecoyType(); - + if (current_type == PeptideHit::TargetDecoyType::UNKNOWN) { // No annotation yet, set based on current protein @@ -1403,8 +1403,8 @@ namespace OpenMS current_peptide_.setSpectrumReference( String("scan=") + String(scannr_)); } //TODO else error? - - + + if (!experiment_label_.empty()) { current_peptide_.setExperimentLabel(experiment_label_); @@ -1650,7 +1650,7 @@ namespace OpenMS { bool current_prot_is_decoy = protein.hasPrefix(decoy_prefix_); auto current_type = peptide_hit_.getTargetDecoyType(); - + if (current_type == PeptideHit::TargetDecoyType::UNKNOWN) { // No annotation yet, set based on current protein @@ -1664,7 +1664,7 @@ namespace OpenMS // Peptide matches both target and decoy proteins peptide_hit_.setTargetDecoyType(PeptideHit::TargetDecoyType::TARGET_DECOY); } - + hit.setTargetDecoyType(current_prot_is_decoy ? ProteinHit::TargetDecoyType::DECOY : ProteinHit::TargetDecoyType::TARGET); @@ -1926,16 +1926,17 @@ namespace OpenMS String cut_before = attributeAsString_(attributes, "cut"); String no_cut_after = attributeAsString_(attributes, "no_cut"); String sense = attributeAsString_(attributes, "sense"); - params_.digestion_enzyme = DigestionEnzymeProtein(DigestionEnzyme( + DigestionEnzymeProtein::Sense sen = (sense.toLower() == "c") ? sen = DigestionEnzymeProtein::Sense::C_TERM : sen = DigestionEnzymeProtein::Sense::N_TERM; + params_.digestion_enzyme = DigestionEnzymeProtein( "user-defined," + enzyme_ + "," + cut_before + "," + no_cut_after + "," + sense, - cut_before, no_cut_after, sense)); + cut_before, sen, no_cut_after); } else if (element == "enzymatic_search_constraint") // parent: "search_summary" { //TODO we should not overwrite the enzyme here! Luckily in most files it is the same // enzyme as in sample_enzyme or something useless like "default". /// - enzyme_ = attributeAsString_(attributes, "enzyme"); + enzyme_ = attributeAsString_(attributes, "enzyme"); if (enzyme_ == "stricttrypsin") enzyme_ = "Trypsin/P"; // MSFragger synonyme if (ProteaseDB::getInstance()->hasEnzyme(enzyme_)) diff --git a/src/openms/source/METADATA/SpectrumNativeIDParser.cpp b/src/openms/source/METADATA/SpectrumNativeIDParser.cpp index b592974e7f3..220ded5e29b 100644 --- a/src/openms/source/METADATA/SpectrumNativeIDParser.cpp +++ b/src/openms/source/METADATA/SpectrumNativeIDParser.cpp @@ -21,7 +21,8 @@ namespace OpenMS { return id.hasPrefix("scan=") || id.hasPrefix("scanId=") || id.hasPrefix("scanID=") || id.hasPrefix("controllerType=") || id.hasPrefix("function=") || id.hasPrefix("sample=") - || id.hasPrefix("index=") || id.hasPrefix("spectrum=") || id.hasPrefix("file="); + || id.hasPrefix("index=") || id.hasPrefix("spectrum=") || id.hasPrefix("file=") + || id.hasPrefix("frame="); } std::string SpectrumNativeIDParser::getRegExFromNativeID(const String& id) @@ -29,9 +30,15 @@ namespace OpenMS // "scan=NUMBER" e.g. Bruker/Agilent // "controllerType=0 controllerNumber=1 scan=NUMBER" for Thermo // "function= process= scan=NUMBER" for Waters + // "frame=FRAME_ID scan=SCAN_ID [precursor=PREC_ID]" for Bruker TDF + // (MS:1002818). Extra trailing tokens (precursor=, windowGroup=, + // scanStart=, scanEnd=, merged=) are used by OpenMS DDA/DIA PASEF + // and pwiz combined mode; the regex targets the "scan=" token + // which remains the most meaningful scan-number proxy. if (id.hasPrefix("scan=") || id.hasPrefix("controllerType=") - || id.hasPrefix("function=")) return std::string(R"(scan=(?\d+))"); + || id.hasPrefix("function=") + || id.hasPrefix("frame=")) return std::string(R"(scan=(?\d+))"); // "index=NUMBER" if (id.hasPrefix("index=")) return std::string(R"(index=(?\d+))"); @@ -83,8 +90,13 @@ namespace OpenMS { // check accession for data type to extract (e.g. MS:1000768 - Thermo nativeID format - scan=xsd:positiveInteger) boost::regex regexp; - // list of CV accessions with native id format "scan=NUMBER" - std::vector scan = {"MS:1000768","MS:1000769","MS:1000771","MS:1000772","MS:1000776"}; + // list of CV accessions with native id format "scan=NUMBER". + // MS:1002818 = "Bruker TDF nativeID format" (pattern "frame= scan=", + // we extend it with a trailing "precursor=" for aggregated DDA + // output — see BrukerTimsFile::loadDDA_ for the rationale). The regex + // targets the "scan=" token in all cases; trailing tokens are + // ignored. + std::vector scan = {"MS:1000768","MS:1000769","MS:1000771","MS:1000772","MS:1000776","MS:1002818"}; // list of CV accession with native id format "file=NUMBER" std::vector file = {"MS:1000773","MS:1000775"}; // expected number of subgroups diff --git a/src/openms/source/PROCESSING/RESAMPLING/LinearResampler.cpp b/src/openms/source/PROCESSING/RESAMPLING/LinearResampler.cpp deleted file mode 100644 index 63402a85e4b..00000000000 --- a/src/openms/source/PROCESSING/RESAMPLING/LinearResampler.cpp +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin -// SPDX-License-Identifier: BSD-3-Clause -// -// -------------------------------------------------------------------------- -// $Maintainer: Timo Sachsenberg $ -// $Authors: Eva Lange $ -// -------------------------------------------------------------------------- -// - -#include - -namespace OpenMS -{ - LinearResampler default_linear_resampler; - LinearResampler default_linear_resampler2; -} diff --git a/src/openms/source/PROCESSING/RESAMPLING/sources.cmake b/src/openms/source/PROCESSING/RESAMPLING/sources.cmake index 07d3d759a29..eb0d0910ff1 100644 --- a/src/openms/source/PROCESSING/RESAMPLING/sources.cmake +++ b/src/openms/source/PROCESSING/RESAMPLING/sources.cmake @@ -3,7 +3,6 @@ set(directory source/PROCESSING/RESAMPLING) ### list all filenames of the directory here set(sources_list -LinearResampler.cpp LinearResamplerAlign.cpp ) diff --git a/src/openms_gui/source/VISUAL/APPLICATIONS/GUITOOLS/ImageCreator.cpp b/src/openms_gui/source/VISUAL/APPLICATIONS/GUITOOLS/ImageCreator.cpp index 53afe331558..21fdab73e39 100644 --- a/src/openms_gui/source/VISUAL/APPLICATIONS/GUITOOLS/ImageCreator.cpp +++ b/src/openms_gui/source/VISUAL/APPLICATIONS/GUITOOLS/ImageCreator.cpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/src/pyOpenMS/bindings/bind_misc.cpp b/src/pyOpenMS/bindings/bind_misc.cpp index 64d0ad0a6f6..9b8e335e832 100644 --- a/src/pyOpenMS/bindings/bind_misc.cpp +++ b/src/pyOpenMS/bindings/bind_misc.cpp @@ -166,7 +166,6 @@ #include #include #include -#include #include #include #include @@ -2641,33 +2640,24 @@ BaseGroupFinder ; def_ProgressLogger(labeledpairfinder_class); - // ----------------------------------------------------------------------- - // LinearResampler - // ----------------------------------------------------------------------- - auto linearresampler_class = nb::class_(m, "LinearResampler", - R"doc( -DefaultParamHandler -ProgressLogger - -Annotates and filters transitions in a TargetedExperiment -:param exp: The input, unfiltered transitions -)doc") - .def(nb::init<>()) - .def("raster", [](const OpenMS::LinearResampler& self, OpenMS::MSSpectrum& spectrum) { return self.raster(spectrum); }, "spectrum"_a, "Applies the resampling algorithm to an MSSpectrum") - .def("rasterExperiment", [](OpenMS::LinearResampler& self, OpenMS::MSExperiment& exp) { return self.rasterExperiment(exp); }, "exp"_a, "Resamples the data in an MSExperiment") - ; - def_ProgressLogger(linearresampler_class); - // ----------------------------------------------------------------------- // LinearResamplerAlign // ----------------------------------------------------------------------- - auto linearresampleralign_class = nb::class_(m, "LinearResamplerAlign", + auto linearresampleralign_class = nb::class_(m, "LinearResamplerAlign", R"doc( Linear Resampling of raw data with alignment -LinearResampler +DefaultParamHandler +ProgressLogger )doc") .def(nb::init<>()) - .def("rasterExperiment", [](OpenMS::LinearResamplerAlign& self, OpenMS::MSExperiment& exp) { return self.rasterExperiment(exp); }, "exp"_a, "Resamples the data in an MSExperiment") + .def(nb::init()) + .def("__copy__", [](const OpenMS::LinearResamplerAlign& self) { return OpenMS::LinearResamplerAlign(self); }) + .def("__deepcopy__", [](const OpenMS::LinearResamplerAlign& self, nb::dict) { return OpenMS::LinearResamplerAlign(self); }, "memo"_a) + .def("raster", [](OpenMS::LinearResamplerAlign& self, OpenMS::MSSpectrum& spectrum) { return self.raster(spectrum); }, "spectrum"_a, "Applies the resampling algorithm to an MSSpectrum") + .def("raster", [](OpenMS::LinearResamplerAlign& self, OpenMS::MSChromatogram& chromatogram) { return self.raster(chromatogram); }, "chromatogram"_a, "Applies the resampling algorithm to an MSChromatogram") + .def("raster_align", [](OpenMS::LinearResamplerAlign& self, OpenMS::MSSpectrum& spectrum, double start_pos, double end_pos) { return self.raster_align(spectrum, start_pos, end_pos); }, "spectrum"_a, "start_pos"_a, "end_pos"_a, "Resamples an MSSpectrum onto an explicit raster [start_pos, end_pos]") + .def("raster_align", [](OpenMS::LinearResamplerAlign& self, OpenMS::MSChromatogram& chromatogram, double start_pos, double end_pos) { return self.raster_align(chromatogram, start_pos, end_pos); }, "chromatogram"_a, "start_pos"_a, "end_pos"_a, "Resamples an MSChromatogram onto an explicit raster [start_pos, end_pos]") + .def("rasterExperiment", [](OpenMS::LinearResamplerAlign& self, OpenMS::MSExperiment& exp) { return self.rasterExperiment(exp); }, "exp"_a, "Resamples all spectra in an MSExperiment") ; def_ProgressLogger(linearresampleralign_class); diff --git a/src/pyOpenMS/tests/unittests/test000.py b/src/pyOpenMS/tests/unittests/test000.py index 13aa9c4ee33..d06be67ef0c 100644 --- a/src/pyOpenMS/tests/unittests/test000.py +++ b/src/pyOpenMS/tests/unittests/test000.py @@ -1853,17 +1853,17 @@ def testItraqConstants(): @report -def testLinearResampler(): +def testLinearResamplerAlign(): """ - @tests: LinearResampler - LinearResampler.__init__ + @tests: LinearResamplerAlign + LinearResamplerAlign.__init__ """ - ff = pyopenms.LinearResampler() + ff = pyopenms.LinearResamplerAlign() p = ff.getDefaults() _testParam(p) - assert pyopenms.LinearResampler().raster is not None - assert pyopenms.LinearResampler().rasterExperiment is not None + assert pyopenms.LinearResamplerAlign().raster is not None + assert pyopenms.LinearResamplerAlign().rasterExperiment is not None @report def testPeptideAndProteinQuant(): diff --git a/src/tests/class_tests/openms/executables.cmake b/src/tests/class_tests/openms/executables.cmake index f0dadadc7af..721a42c2d52 100644 --- a/src/tests/class_tests/openms/executables.cmake +++ b/src/tests/class_tests/openms/executables.cmake @@ -346,7 +346,6 @@ set(filtering_executables_list GaussFilterAlgorithm_test IDFilter_test InternalCalibration_test - LinearResampler_test LinearResamplerAlign_test LowessSmoothing_test MassTraceDetection_test @@ -411,6 +410,7 @@ set(chemistry_executables_list CoarseIsotopeDistribution_test CrossLinksDB_test DecoyGenerator_test + DigestionEnzyme_test DigestionEnzymeProtein_test ElementDB_test Element_test @@ -600,7 +600,7 @@ set(transformations_executables_list EmgFitter1D_test EmgModel_test ExtendedIsotopeFitter1D_test - ExtendedIsotopeModel_test + ExtendedIsotopeModel_test FeatureFinderAlgorithmPickedHelperStructs_test FeatureFinderAlgorithmPicked_test FeatureFinderIdentificationAlgorithm_test diff --git a/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp b/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp index 3f5592d819d..ba22eae131b 100644 --- a/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp +++ b/src/tests/class_tests/openms/source/BrukerTimsFile_test.cpp @@ -248,6 +248,9 @@ START_SECTION(DDA loading integration test) TEST_NOT_EQUAL(spec.getDriftTime(), 0.0); TEST_EQUAL(spec.getDriftTimeUnit(), DriftTimeUnit::VSSC); TEST_NOT_EQUAL(spec.getPrecursors()[0].getMZ(), 0.0); + // TDF MS2 peaks are detector-centroided; must be labelled CENTROID so + // downstream tools (e.g. CometAdapter) don't reject them as profile. + TEST_EQUAL(spec.getType(), SpectrumSettings::SpectrumType::CENTROID); break; } } @@ -257,6 +260,131 @@ START_SECTION(DDA loading integration test) } END_SECTION +START_SECTION(DDA native ID format test) +{ + // Contract: DDA MS2 native IDs are "frame= scan= precursor=

". + // This extends the MS:1002818 pattern with a trailing "precursor=

" + // token because OpenMS aggregates all PasefFrameMsMsInfo entries sharing + // the same Precursors.Id into ONE spectrum (pwiz emits per-mobility-scan, + // so its (frame, scan_begin) pairs are inherently unique — ours are not). + // The Precursors.Id is ALSO duplicated as MetaValue "bruker_precursor_id" + // for typed programmatic access. + BrukerTimsFile f; + MSExperiment exp; + f.load(OPENTIMS_DDA_TEST_DATA, exp); + + // Find a non-empty MS2 spectrum + const MSSpectrum* ms2 = nullptr; + for (const auto& spec : exp) + { + if (spec.getMSLevel() == 2 && !spec.empty()) + { + ms2 = &spec; + break; + } + } + TEST_NOT_EQUAL(ms2, nullptr); + + const String& id = ms2->getNativeID(); + TEST_EQUAL(id.hasPrefix("frame="), true); + TEST_EQUAL(id.hasSubstring(" scan="), true); + TEST_EQUAL(id.hasSubstring(" precursor="), true); + + // Precursors.Id must ALSO be stored as a typed MetaValue. + TEST_EQUAL(ms2->metaValueExists("bruker_precursor_id"), true); + + // All DDA MS2 native IDs must be unique inside the run (XSD-level mzML + // requirement). This is what the "precursor=

" disambiguator buys us. + std::set ms2_ids; + Size ms2_count = 0; + for (const auto& spec : exp) + { + if (spec.getMSLevel() == 2) + { + ms2_ids.insert(spec.getNativeID()); + ++ms2_count; + } + } + TEST_EQUAL(ms2_ids.size(), ms2_count); + + STATUS("DDA MS2 native ID sample: " << id + << " bruker_precursor_id=" << ms2->getMetaValue("bruker_precursor_id").toString()); +} +END_SECTION + +START_SECTION(Bruker load_ms1=false test) +{ + BrukerTimsFile f; + + // Baseline: default config loads both MS1 and MS2 + MSExperiment exp_both; + f.load(OPENTIMS_DDA_TEST_DATA, exp_both); + Size raw_ms1 = 0, raw_ms2 = 0; + for (const auto& s : exp_both) + { + if (s.getMSLevel() == 1) ++raw_ms1; + else if (s.getMSLevel() == 2) ++raw_ms2; + } + TEST_NOT_EQUAL(raw_ms1, 0); + TEST_NOT_EQUAL(raw_ms2, 0); + + // load_ms1=false: zero MS1, MS2 count unchanged + BrukerTimsFile::Config cfg; + cfg.load_ms1 = false; + MSExperiment exp_ms2_only; + f.load(OPENTIMS_DDA_TEST_DATA, exp_ms2_only, cfg); + Size skip_ms1 = 0, skip_ms2 = 0; + for (const auto& s : exp_ms2_only) + { + if (s.getMSLevel() == 1) ++skip_ms1; + else if (s.getMSLevel() == 2) ++skip_ms2; + } + TEST_EQUAL(skip_ms1, 0); + TEST_EQUAL(skip_ms2, raw_ms2); + + STATUS("DDA load_ms1=false: raw MS1=" << raw_ms1 << " MS2=" << raw_ms2 + << " | skipped MS1=" << skip_ms1 << " MS2=" << skip_ms2); + +#ifdef OPENTIMS_DIA_TEST_DATA + // Same invariant on DIA + MSExperiment exp_dia_both; + f.load(OPENTIMS_DIA_TEST_DATA, exp_dia_both); + Size dia_raw_ms1 = 0, dia_raw_ms2 = 0; + for (const auto& s : exp_dia_both) + { + if (s.getMSLevel() == 1) ++dia_raw_ms1; + else if (s.getMSLevel() == 2) ++dia_raw_ms2; + } + + BrukerTimsFile::Config cfg_dia; + cfg_dia.load_ms1 = false; + MSExperiment exp_dia_skip; + f.load(OPENTIMS_DIA_TEST_DATA, exp_dia_skip, cfg_dia); + Size dia_skip_ms1 = 0, dia_skip_ms2 = 0; + for (const auto& s : exp_dia_skip) + { + if (s.getMSLevel() == 1) ++dia_skip_ms1; + else if (s.getMSLevel() == 2) ++dia_skip_ms2; + } + TEST_EQUAL(dia_skip_ms1, 0); + TEST_EQUAL(dia_skip_ms2, dia_raw_ms2); + + // FRAME export mode must also honor load_ms1 (skips level==1 loop) + BrukerTimsFile::Config cfg_frame; + cfg_frame.export_mode = BrukerTimsFile::Config::FRAME; + cfg_frame.load_ms1 = false; + MSExperiment exp_frame; + f.load(OPENTIMS_DIA_TEST_DATA, exp_frame, cfg_frame); + Size frame_ms1 = 0; + for (const auto& s : exp_frame) + { + if (s.getMSLevel() == 1) ++frame_ms1; + } + TEST_EQUAL(frame_ms1, 0); +#endif +} +END_SECTION + START_SECTION(DDA round-trip test: load .d -> write mzML -> reload -> verify) { // Load from .d @@ -313,6 +441,8 @@ START_SECTION(DDA frame-level loading test) if (!spec.empty()) { TEST_EQUAL(spec.containsIMData(), true); + // frameToSpectrum emits detector-centroided peaks; label as CENTROID. + TEST_EQUAL(spec.getType(), SpectrumSettings::SpectrumType::CENTROID); break; } } @@ -486,6 +616,62 @@ START_SECTION(DDA search engine IM annotation integration test) } END_SECTION +START_SECTION(DDA MS1 aggregation with RT cap test) +{ + BrukerTimsFile f; + + // Baseline: raw DDA MS1 + MSExperiment exp_raw; + f.load(OPENTIMS_DDA_TEST_DATA, exp_raw); + + // Aggregated: ms1_n_neighbors=1 uncapped + BrukerTimsFile::Config cfg_uncapped; + cfg_uncapped.ms1_n_neighbors = 1; + cfg_uncapped.ms1_max_rt_distance_sec = 0.0; + MSExperiment exp_uncapped; + f.load(OPENTIMS_DDA_TEST_DATA, exp_uncapped, cfg_uncapped); + + // Aggregated: tight 1.0s RT cap (may be inert if fixture MS1 cadence is tight) + BrukerTimsFile::Config cfg_capped; + cfg_capped.ms1_n_neighbors = 1; + cfg_capped.ms1_max_rt_distance_sec = 1.0; + MSExperiment exp_capped; + f.load(OPENTIMS_DDA_TEST_DATA, exp_capped, cfg_capped); + + auto count_ms1 = [](const MSExperiment& e, Size& n_spectra, double& total_intensity) + { + n_spectra = 0; total_intensity = 0.0; + for (const auto& s : e) + if (s.getMSLevel() == 1) + { + ++n_spectra; + for (const auto& p : s) total_intensity += p.getIntensity(); + } + }; + + Size raw_n, uncapped_n, capped_n; + double raw_i, uncapped_i, capped_i; + count_ms1(exp_raw, raw_n, raw_i); + count_ms1(exp_uncapped, uncapped_n, uncapped_i); + count_ms1(exp_capped, capped_n, capped_i); + + TEST_NOT_EQUAL(raw_n, 0); + TEST_NOT_EQUAL(uncapped_n, 0); + TEST_NOT_EQUAL(capped_n, 0); + + // Uncapped aggregation boosts intensity over raw + TEST_EQUAL(uncapped_i > raw_i, true); + + // Capped aggregation ≤ uncapped (cap can only exclude neighbors, never add) + TEST_EQUAL(capped_i <= uncapped_i, true); + + STATUS("DDA MS1 aggregation: raw intensity=" << raw_i + << " uncapped=" << uncapped_i + << " capped(1.0s)=" << capped_i + << " cap_reduction=" << (1.0 - capped_i / uncapped_i)); +} +END_SECTION + #endif // OPENTIMS_DDA_TEST_DATA #ifdef OPENTIMS_DIA_TEST_DATA @@ -505,6 +691,8 @@ START_SECTION(DIA loading integration test) { TEST_EQUAL(spec.containsIMData(), true); TEST_NOT_EQUAL(spec.getPrecursors().size(), 0); + // TDF MS2 peaks are detector-centroided; must be labelled CENTROID. + TEST_EQUAL(spec.getType(), SpectrumSettings::SpectrumType::CENTROID); break; } } @@ -586,7 +774,7 @@ START_SECTION(DIA MS2 aggregation test) // Count MS2 spectra, total MS2 peaks, and cumulative MS2 intensity in both. // - // DIAFrameAggregator (BrukerTimsFile.cpp) is a two-stage operation on a + // FrameAggregator (BrukerTimsFile.cpp) is a two-stage operation on a // sparse 2D grid keyed by (mz_bin, scan_id) — NOT (mz_bin, scan_id, frame_id): // // 1. SUM stage: for each (center_frame_i, window) position, accumulate @@ -736,6 +924,446 @@ START_SECTION(DIA MS2 centroiding test) } END_SECTION +START_SECTION(DIA MS2 centroiding without denoising (min_support=0)) +{ + // Regression guard for the "centroid without noise filter" path exposed via + // Config::dia_ms2_min_support = 0. With min_support=0 the 3x3 neighbor filter + // in FrameAggregator::finalize{,Centroided} is a no-op (`neighbors < 0` is + // always false), so every populated grid cell survives the denoise step. + // Under centroiding that means more local maxima → strictly more output peaks + // than the denoised (min_support=1) path, because denoising preferentially + // removes isolated cells that would otherwise become their own centroid. + BrukerTimsFile f; + + BrukerTimsFile::Config cfg_denoise; + cfg_denoise.dia_ms2_n_neighbors = 1; + cfg_denoise.dia_ms2_min_support = 1; + cfg_denoise.dia_ms2_centroid = true; + MSExperiment exp_denoise; + f.load(OPENTIMS_DIA_TEST_DATA, exp_denoise, cfg_denoise); + + BrukerTimsFile::Config cfg_no_denoise; + cfg_no_denoise.dia_ms2_n_neighbors = 1; + cfg_no_denoise.dia_ms2_min_support = 0; + cfg_no_denoise.dia_ms2_centroid = true; + MSExperiment exp_no_denoise; + f.load(OPENTIMS_DIA_TEST_DATA, exp_no_denoise, cfg_no_denoise); + + Size denoise_peaks = 0, no_denoise_peaks = 0; + Size denoise_spectra = 0, no_denoise_spectra = 0; + for (const auto& spec : exp_denoise) + if (spec.getMSLevel() == 2) { ++denoise_spectra; denoise_peaks += spec.size(); } + for (const auto& spec : exp_no_denoise) + if (spec.getMSLevel() == 2) { ++no_denoise_spectra; no_denoise_peaks += spec.size(); } + + // Without denoising, a few spectra that would become empty after denoising + // (and are dropped by the `if (peaks.empty()) continue;` guard in the + // BrukerTimsFile loader) are now retained. Count is >= denoised count. + TEST_EQUAL(no_denoise_spectra >= denoise_spectra, true); + + // Without denoising the centroided output contains strictly more peaks: + // every populated grid cell becomes a centroid candidate instead of being + // dropped when isolated. + TEST_NOT_EQUAL(no_denoise_peaks, 0); + TEST_EQUAL(no_denoise_peaks > denoise_peaks, true); + + // Output is still centroided in the IM dimension. + for (const auto& spec : exp_no_denoise) + { + if (spec.getMSLevel() == 2 && !spec.empty()) + { + TEST_EQUAL(spec.containsIMData(), true); + TEST_EQUAL(spec.getIMPeakType() == IMPeakType::IM_CENTROIDED, true); + TEST_NOT_EQUAL(spec.getPrecursors().size(), 0); + break; + } + } + + STATUS("DIA centroiding (no denoise): MS2 peaks=" << no_denoise_peaks + << " vs denoised=" << denoise_peaks + << " ratio=" << (static_cast(no_denoise_peaks) / denoise_peaks)); +} +END_SECTION + +START_SECTION(DIA MS1 default config regression test) +{ + // Regression guard: with all new MS1 aggregation knobs at their defaults + // (ms1_n_neighbors=0, ms1_min_support=0, ms1_max_rt_distance_sec=0.0), + // the DIA MS1 output must be byte-identical to the pre-PR load. This + // section fails if a future change accidentally enables aggregation + // by default. + BrukerTimsFile f; + + BrukerTimsFile::Config cfg_default; // all defaults + MSExperiment exp_default; + f.load(OPENTIMS_DIA_TEST_DATA, exp_default, cfg_default); + + // Snapshot: count MS1 spectra, total MS1 peaks, total MS1 intensity + Size ms1_spectra = 0, ms1_peaks = 0; + double ms1_intensity = 0.0; + for (const auto& spec : exp_default) + { + if (spec.getMSLevel() == 1) + { + ++ms1_spectra; + ms1_peaks += spec.size(); + for (const auto& p : spec) ms1_intensity += p.getIntensity(); + } + } + + TEST_NOT_EQUAL(ms1_spectra, 0); + TEST_NOT_EQUAL(ms1_peaks, 0); + + STATUS("DIA MS1 default-config snapshot: spectra=" << ms1_spectra + << " peaks=" << ms1_peaks + << " intensity=" << ms1_intensity); + + // Independently reproduce with ms1_n_neighbors=0 explicitly — assert + // the numbers match exactly, proving the knob at 0 is inert. + BrukerTimsFile::Config cfg_explicit_off; + cfg_explicit_off.ms1_n_neighbors = 0; + cfg_explicit_off.ms1_min_support = 0; + cfg_explicit_off.ms1_max_rt_distance_sec = 0.0; + MSExperiment exp_explicit_off; + f.load(OPENTIMS_DIA_TEST_DATA, exp_explicit_off, cfg_explicit_off); + + Size ms1_spectra_b = 0, ms1_peaks_b = 0; + double ms1_intensity_b = 0.0; + for (const auto& spec : exp_explicit_off) + { + if (spec.getMSLevel() == 1) + { + ++ms1_spectra_b; + ms1_peaks_b += spec.size(); + for (const auto& p : spec) ms1_intensity_b += p.getIntensity(); + } + } + + TEST_EQUAL(ms1_spectra_b, ms1_spectra); + TEST_EQUAL(ms1_peaks_b, ms1_peaks); + TEST_REAL_SIMILAR(ms1_intensity_b, ms1_intensity); +} +END_SECTION + +START_SECTION(DIA MS1 aggregation test) +{ + // RED test for MS1 frame aggregation. With ms1_n_neighbors=1 we expect + // ~same spectrum count as raw (edge-truncation aside), boosted intensity, + // and IM_PROFILE output with per-peak IM. + BrukerTimsFile f; + + MSExperiment exp_raw; + f.load(OPENTIMS_DIA_TEST_DATA, exp_raw); + + BrukerTimsFile::Config cfg; + cfg.ms1_n_neighbors = 1; + MSExperiment exp_agg; + f.load(OPENTIMS_DIA_TEST_DATA, exp_agg, cfg); + + Size raw_ms1_spectra = 0, raw_ms1_peaks = 0; + double raw_ms1_intensity = 0.0; + for (const auto& spec : exp_raw) + { + if (spec.getMSLevel() == 1) + { + ++raw_ms1_spectra; + raw_ms1_peaks += spec.size(); + for (const auto& p : spec) raw_ms1_intensity += p.getIntensity(); + } + } + + Size agg_ms1_spectra = 0, agg_ms1_peaks = 0; + double agg_ms1_intensity = 0.0; + for (const auto& spec : exp_agg) + { + if (spec.getMSLevel() == 1) + { + ++agg_ms1_spectra; + agg_ms1_peaks += spec.size(); + for (const auto& p : spec) agg_ms1_intensity += p.getIntensity(); + } + } + + TEST_NOT_EQUAL(raw_ms1_spectra, 0); + TEST_NOT_EQUAL(agg_ms1_spectra, 0); + + // Spectrum count: aggregated MS1 path emits exactly one spectrum per center + // frame (empty spectra emitted when all peaks filter out — see the empty- + // spectrum contract in loadAggregatedMS1Spectrum). Unlike the MS2 path, + // this is a hard equality: MS1 never drops spectra. + TEST_EQUAL(agg_ms1_spectra, raw_ms1_spectra); + + // Intensity boost: aggregation sums intensity across 2*N+1 = 3 frames. + // MS1 fixture shows ratio ≈ 2.99 (empirical) — much higher than MS2's ≈1.28 + // because MS1 ion populations are more stable across adjacent frames: peaks + // at the same nominal (mz_bin, scan_id) from different frames tend to land + // in different cells (scan_id jitter, m/z jitter beyond 0.01 Da), so the + // aggregator rarely merges them — closer to the naive 3x sum than MS2. + // Bounds reflect this: accept anywhere in [2.5, 3.1] as valid aggregation. + TEST_EQUAL(raw_ms1_intensity > 0.0, true); + const double intensity_ratio = agg_ms1_intensity / raw_ms1_intensity; + TEST_EQUAL(intensity_ratio >= 2.5, true); + TEST_EQUAL(intensity_ratio <= 3.1, true); + + // Output must carry per-peak IM with IM_PROFILE type + for (const auto& spec : exp_agg) + { + if (spec.getMSLevel() == 1 && !spec.empty()) + { + TEST_EQUAL(spec.containsIMData(), true); + TEST_EQUAL(spec.getIMPeakType() == IMPeakType::IM_PROFILE, true); + // MS1 spectra must NOT carry precursors (unlike MS2). + TEST_EQUAL(spec.getPrecursors().empty(), true); + break; + } + } + + STATUS("DIA MS1 aggregation: raw spectra=" << raw_ms1_spectra + << " peaks=" << raw_ms1_peaks + << " intensity=" << raw_ms1_intensity + << " | aggregated spectra=" << agg_ms1_spectra + << " peaks=" << agg_ms1_peaks + << " intensity=" << agg_ms1_intensity + << " | ratio=" << intensity_ratio); +} +END_SECTION + +START_SECTION(DIA MS1 aggregation with centroiding test) +{ + BrukerTimsFile f; + + // Aggregated, no centroiding (baseline) + BrukerTimsFile::Config cfg_profile; + cfg_profile.ms1_n_neighbors = 1; + MSExperiment exp_profile; + f.load(OPENTIMS_DIA_TEST_DATA, exp_profile, cfg_profile); + + // Aggregated + within-frame centroiding + BrukerTimsFile::Config cfg_centroid; + cfg_centroid.ms1_n_neighbors = 1; + cfg_centroid.ms1_centroid_mz_ppm = 5.0f; + cfg_centroid.ms1_centroid_im_pct = 3.0f; + MSExperiment exp_centroid; + f.load(OPENTIMS_DIA_TEST_DATA, exp_centroid, cfg_centroid); + + auto count_ms1 = [](const MSExperiment& e, Size& n_spectra, Size& n_peaks, double& total_intensity) + { + n_spectra = 0; n_peaks = 0; total_intensity = 0.0; + for (const auto& s : e) + if (s.getMSLevel() == 1) + { + ++n_spectra; + n_peaks += s.size(); + for (const auto& p : s) total_intensity += p.getIntensity(); + } + }; + + Size profile_n, profile_p, centroid_n, centroid_p; + double profile_i, centroid_i; + count_ms1(exp_profile, profile_n, profile_p, profile_i); + count_ms1(exp_centroid, centroid_n, centroid_p, centroid_i); + + TEST_NOT_EQUAL(centroid_n, 0); + + // Same spectrum count (centroiding doesn't drop spectra) + TEST_EQUAL(centroid_n, profile_n); + + // Centroiding collapses IM-adjacent peaks → fewer peaks than profile. + TEST_EQUAL(centroid_p < profile_p, true); + + // Intensity conservation: with the default ms1_centroid_max_peaks=100000 + // the cap rarely fires on typical aggregated MS1 surveys (top 100k peaks + // out of ~600k carry >80% of total intensity). Allow [0.5, 1.0] to cover + // dense fixtures where a noticeable long-tail still gets dropped. + const double ratio = centroid_i / profile_i; + TEST_EQUAL(ratio > 0.5, true); + TEST_EQUAL(ratio <= 1.0, true); + + // Output shape + for (const auto& spec : exp_centroid) + { + if (spec.getMSLevel() == 1 && !spec.empty()) + { + TEST_EQUAL(spec.containsIMData(), true); + TEST_EQUAL(spec.getIMPeakType() == IMPeakType::IM_CENTROIDED, true); + TEST_EQUAL(spec.getType() == SpectrumSettings::SpectrumType::CENTROID, true); + TEST_EQUAL(spec.getPrecursors().empty(), true); + break; + } + } + + STATUS("DIA MS1 aggregation+centroid: profile peaks=" << profile_p + << " centroid peaks=" << centroid_p + << " ratio=" << (static_cast(centroid_p) / profile_p) + << " intensity_ratio=" << ratio); +} +END_SECTION + +START_SECTION(DIA MS1 aggregation min_support test) +{ + BrukerTimsFile f; + + // No denoise (baseline from DIA MS1 aggregation test: intensity ratio ≈ 3.0) + BrukerTimsFile::Config cfg_no_denoise; + cfg_no_denoise.ms1_n_neighbors = 1; + cfg_no_denoise.ms1_min_support = 0; + MSExperiment exp_no_denoise; + f.load(OPENTIMS_DIA_TEST_DATA, exp_no_denoise, cfg_no_denoise); + + // With denoise + BrukerTimsFile::Config cfg_denoise; + cfg_denoise.ms1_n_neighbors = 1; + cfg_denoise.ms1_min_support = 1; + MSExperiment exp_denoise; + f.load(OPENTIMS_DIA_TEST_DATA, exp_denoise, cfg_denoise); + + Size no_denoise_peaks = 0, denoise_peaks = 0; + double no_denoise_intensity = 0.0, denoise_intensity = 0.0; + for (const auto& s : exp_no_denoise) + if (s.getMSLevel() == 1) + { + no_denoise_peaks += s.size(); + for (const auto& p : s) no_denoise_intensity += p.getIntensity(); + } + for (const auto& s : exp_denoise) + if (s.getMSLevel() == 1) + { + denoise_peaks += s.size(); + for (const auto& p : s) denoise_intensity += p.getIntensity(); + } + + TEST_NOT_EQUAL(no_denoise_peaks, 0); + TEST_NOT_EQUAL(denoise_peaks, 0); + + // Denoise drops peaks. Conservative lower bound: ≥10% drop; MS1 surveys + // often have many low-intensity isolated cells that the 3x3 filter culls. + TEST_EQUAL(denoise_peaks < no_denoise_peaks, true); + const double drop_frac = 1.0 - static_cast(denoise_peaks) / no_denoise_peaks; + TEST_EQUAL(drop_frac >= 0.10, true); + + // Intensity boost still present after denoise (signal peaks survive). + MSExperiment exp_raw; + f.load(OPENTIMS_DIA_TEST_DATA, exp_raw); + double raw_intensity = 0.0; + for (const auto& s : exp_raw) + if (s.getMSLevel() == 1) + for (const auto& p : s) raw_intensity += p.getIntensity(); + TEST_EQUAL(denoise_intensity > raw_intensity, true); + + STATUS("DIA MS1 min_support: no_denoise_peaks=" << no_denoise_peaks + << " denoise_peaks=" << denoise_peaks + << " drop_frac=" << drop_frac + << " denoise_intensity/raw=" << (denoise_intensity / raw_intensity)); +} +END_SECTION + +START_SECTION(DIA MS1 RT cap test) +{ + BrukerTimsFile f; + + // Uncapped reference: ms1_n_neighbors=2 gives a wider window for the cap + // to bite into. + BrukerTimsFile::Config cfg_uncapped; + cfg_uncapped.ms1_n_neighbors = 2; + cfg_uncapped.ms1_max_rt_distance_sec = 0.0; + MSExperiment exp_uncapped; + f.load(OPENTIMS_DIA_TEST_DATA, exp_uncapped, cfg_uncapped); + + // Tight cap: 0.5s (DIA MS1 cadence ~1.5s, so N=2 reaches ~3s total span; + // the 0.5s cap should truncate aggressively). + BrukerTimsFile::Config cfg_capped; + cfg_capped.ms1_n_neighbors = 2; + cfg_capped.ms1_max_rt_distance_sec = 0.5; + MSExperiment exp_capped; + f.load(OPENTIMS_DIA_TEST_DATA, exp_capped, cfg_capped); + + auto total_ms1_intensity = [](const MSExperiment& e) + { + double s = 0.0; + for (const auto& spec : e) + if (spec.getMSLevel() == 1) + for (const auto& p : spec) s += p.getIntensity(); + return s; + }; + + double uncapped_i = total_ms1_intensity(exp_uncapped); + double capped_i = total_ms1_intensity(exp_capped); + + TEST_EQUAL(uncapped_i > 0.0, true); + TEST_EQUAL(capped_i > 0.0, true); + TEST_EQUAL(capped_i < uncapped_i, true); + + // Cadence invariant: RT cap must never drop a center frame. Both runs emit + // exactly one MS1 spectrum per raw MS1 frame regardless of the cap. + auto count_ms1_spectra = [](const MSExperiment& e) + { + Size n = 0; + for (const auto& s : e) if (s.getMSLevel() == 1) ++n; + return n; + }; + MSExperiment exp_raw; + f.load(OPENTIMS_DIA_TEST_DATA, exp_raw); + const Size raw_ms1 = count_ms1_spectra(exp_raw); + TEST_EQUAL(count_ms1_spectra(exp_uncapped), raw_ms1); + TEST_EQUAL(count_ms1_spectra(exp_capped), raw_ms1); + + STATUS("DIA MS1 RT cap: uncapped=" << uncapped_i + << " capped(0.5s, N=2)=" << capped_i + << " reduction=" << (1.0 - capped_i / uncapped_i)); +} +END_SECTION + +START_SECTION(FRAME mode ignores ms1_n_neighbors test) +{ + BrukerTimsFile f; + + // Reference: raw FRAME export with defaults + BrukerTimsFile::Config cfg_ref; + cfg_ref.export_mode = BrukerTimsFile::Config::FRAME; + MSExperiment exp_ref; + f.load(OPENTIMS_DIA_TEST_DATA, exp_ref, cfg_ref); + + // Same load but with aggregation knob set — must be ignored + BrukerTimsFile::Config cfg_agg; + cfg_agg.export_mode = BrukerTimsFile::Config::FRAME; + cfg_agg.ms1_n_neighbors = 1; + MSExperiment exp_agg; + f.load(OPENTIMS_DIA_TEST_DATA, exp_agg, cfg_agg); + + Size ref_ms1 = 0, agg_ms1 = 0; + Size ref_peaks = 0, agg_peaks = 0; + double ref_intensity = 0.0, agg_intensity = 0.0; + for (const auto& s : exp_ref) + if (s.getMSLevel() == 1) + { + ++ref_ms1; + ref_peaks += s.size(); + for (const auto& p : s) ref_intensity += p.getIntensity(); + } + for (const auto& s : exp_agg) + if (s.getMSLevel() == 1) + { + ++agg_ms1; + agg_peaks += s.size(); + for (const auto& p : s) agg_intensity += p.getIntensity(); + } + + TEST_NOT_EQUAL(ref_ms1, 0); + // FRAME mode must emit one spectrum per raw MS1 frame regardless of + // ms1_n_neighbors — aggregation is ignored. + TEST_EQUAL(agg_ms1, ref_ms1); + // Stronger: content must be identical too. Aggregation would inflate both + // peak count and cumulative intensity; FRAME mode's "raw frames" contract + // requires bit-for-bit behavior regardless of the ms1_n_neighbors knob. + TEST_EQUAL(agg_peaks, ref_peaks); + TEST_REAL_SIMILAR(agg_intensity, ref_intensity); + + // Warning emission is not asserted here — the test framework has no + // log-capture helper at present. Manual verification: the test output + // should show the "Warning: ms1_n_neighbors ... ignored" line. +} +END_SECTION + START_SECTION(DIA readDIAMetadata test) { BrukerTimsFile f; @@ -831,6 +1459,67 @@ START_SECTION(DIA loadDIAStreaming test) } END_SECTION +START_SECTION(DIA MS1 streaming aggregation test) +{ + BrukerTimsFile f; + + // Baseline: streaming with defaults (no aggregation) + ExperimentalSettings settings_ref; + auto meta_ref = f.readDIAMetadata(OPENTIMS_DIA_TEST_DATA, settings_ref); + RegularSwathFileConsumer consumer_ref(meta_ref.boundaries); + consumer_ref.setExperimentalSettings(settings_ref); + f.loadDIAStreaming(OPENTIMS_DIA_TEST_DATA, consumer_ref); + std::vector maps_ref; + consumer_ref.retrieveSwathMaps(maps_ref); + + // Aggregated streaming (ms1_n_neighbors=1) + ExperimentalSettings settings_agg; + auto meta_agg = f.readDIAMetadata(OPENTIMS_DIA_TEST_DATA, settings_agg); + RegularSwathFileConsumer consumer_agg(meta_agg.boundaries); + consumer_agg.setExperimentalSettings(settings_agg); + BrukerTimsFile::Config cfg; + cfg.ms1_n_neighbors = 1; + f.loadDIAStreaming(OPENTIMS_DIA_TEST_DATA, consumer_agg, cfg); + std::vector maps_agg; + consumer_agg.retrieveSwathMaps(maps_agg); + + // Locate each run's MS1 map by its ms1 flag (not by position). + auto find_ms1_map = [](const std::vector& maps) + -> const OpenSwath::SwathMap* { + for (const auto& m : maps) if (m.ms1) return &m; + return nullptr; + }; + const auto* ms1_ref = find_ms1_map(maps_ref); + const auto* ms1_agg = find_ms1_map(maps_agg); + TEST_NOT_EQUAL(ms1_ref, nullptr); + TEST_NOT_EQUAL(ms1_agg, nullptr); + + // Cadence invariant: both paths deliver one spectrum per raw MS1 frame. + TEST_EQUAL(ms1_ref->sptr->getNrSpectra(), + static_cast(meta_ref.nr_ms1_spectra)); + TEST_EQUAL(ms1_agg->sptr->getNrSpectra(), + static_cast(meta_agg.nr_ms1_spectra)); + + // Proof that aggregation actually ran: total MS1 peak count must be + // strictly higher in the aggregated path (stacked 2*N+1=3 frames of raw + // peaks into each output spectrum). If ms1_n_neighbors were silently + // ignored in loadDIAStreaming, the counts would be equal. + auto total_peaks = [](const OpenSwath::SwathMap& m) -> Size { + Size n = 0; + for (Size i = 0; i < m.sptr->getNrSpectra(); ++i) + n += m.sptr->getSpectrumById(i)->getMZArray()->data.size(); + return n; + }; + const Size peaks_ref = total_peaks(*ms1_ref); + const Size peaks_agg = total_peaks(*ms1_agg); + TEST_EQUAL(peaks_agg > peaks_ref, true); + + STATUS("DIA MS1 streaming aggregation: ref peaks=" << peaks_ref + << " agg peaks=" << peaks_agg + << " ratio=" << (static_cast(peaks_agg) / peaks_ref)); +} +END_SECTION + #endif // OPENTIMS_DIA_TEST_DATA END_TEST diff --git a/src/tests/class_tests/openms/source/DigestionEnzymeProtein_test.cpp b/src/tests/class_tests/openms/source/DigestionEnzymeProtein_test.cpp index 46e987b5015..90c057e6426 100644 --- a/src/tests/class_tests/openms/source/DigestionEnzymeProtein_test.cpp +++ b/src/tests/class_tests/openms/source/DigestionEnzymeProtein_test.cpp @@ -3,7 +3,7 @@ // // -------------------------------------------------------------------------- // $Maintainer: Xiao Liang $ -// $Authors: Xiao Liang $ +// $Authors: Xiao Liang, Alen Saric $ // -------------------------------------------------------------------------- // @@ -275,5 +275,37 @@ END_SECTION delete e_ptr; -END_TEST +START_SECTION((DigestionEnzymeProtein(const String& name, String cut_before, Sense sense, const String& nocut_after, const std::set& synonyms, String regex_description))) +{ + DigestionEnzymeProtein trypsin_style("TrypsinStyle", "K", DigestionEnzymeProtein::Sense::C_TERM, "P"); + TEST_EQUAL(trypsin_style.getRegEx(), "(?<=[KX])(?!P])") + + DigestionEnzymeProtein arg_c("Arg-C_Style", "R", DigestionEnzymeProtein::Sense::C_TERM); + TEST_EQUAL(arg_c.getRegEx(), "(?<=[RX])") + + DigestionEnzymeProtein n_term_test("N-Term_Style", "D", DigestionEnzymeProtein::Sense::N_TERM, "E"); + TEST_EQUAL(n_term_test.getRegEx(), "(? +#include + +////////////////////////////////////////// + +#include +#include + +using namespace OpenMS; +using namespace std; + +////////////////////////////////////////// + +START_TEST(DigestionEnzyme,"$ID") + +////////////////////////////////////////// + +DigestionEnzyme* e_ptr = nullptr; +DigestionEnzyme* e_null = nullptr; + + +START_SECTION((DigestionEnzyme())) + e_ptr = new DigestionEnzyme(); + TEST_NOT_EQUAL(e_ptr,e_null) +END_SECTION + +START_SECTION((virtual ~DigestionEnzyme())) + delete e_ptr; +END_SECTION + +START_SECTION(bool setValueFromFile(const String& key, const String& value)) + DigestionEnzyme enzyme; + + // Test the Name Setting. + TEST_EQUAL(enzyme.setValueFromFile("test:Name","Trypsin"),true) + TEST_EQUAL(enzyme.getName(),"Trypsin") + + // Test the RegEx Setting. + TEST_EQUAL(enzyme.setValueFromFile("test:RegEx","Reg"),true) + TEST_EQUAL(enzyme.getRegEx(), "Reg") + + // Test the RegExDescription Setting. + TEST_EQUAL(enzyme.setValueFromFile("test:RegExDescription","Desc"),true) + TEST_EQUAL(enzyme.getRegExDescription(),"Desc") + + // Test Synonym Setting + TEST_EQUAL(enzyme.setValueFromFile("syn:Synonyms:","Trypsin"),true) + TEST_EQUAL(enzyme.setValueFromFile("test:Synonyms:","TrypsinI"),true) + + // Since Synonyms are a set, test using set functions. + TEST_EQUAL(enzyme.getSynonyms().count("Trypsin"),1) + TEST_EQUAL(enzyme.getSynonyms().size(),2) + + // Test incorrect keys. + TEST_NOT_EQUAL(enzyme.setValueFromFile("test","Tryp-Like"),true) +END_SECTION + +START_SECTION(bool operator==(const String& cleavage_regex) const) + DigestionEnzyme enzyme; + enzyme.setRegEx("Verify"); + + TEST_EQUAL(enzyme == "Verify",true) + TEST_NOT_EQUAL(enzyme == "Accept",true) +END_SECTION + +START_SECTION(bool operator!=(const String& cleavage_regex) const) + DigestionEnzyme enzyme; + enzyme.setRegEx("Verify"); + + TEST_EQUAL(enzyme != "Accept",true) + TEST_NOT_EQUAL(enzyme != "Verify",true) +END_SECTION + +// < compares the names of the enzymes. +START_SECTION(bool operator<(const DigestionEnzyme& enzyme) const) + DigestionEnzyme e1,e2; + + e1.setName("A_Enzyme"); + e2.setName("B_ENZYME"); + + TEST_EQUAL(e1 < e2, true) + TEST_EQUAL(e2 < e1, false) + + // Safety test: when names are same, neither greater nor smaller, whatever the regex. + DigestionEnzyme e3; + e3.setName("A_Enzyme"); + e3.setRegEx("Greater"); + + TEST_NOT_EQUAL(e1 < e3, true) + TEST_NOT_EQUAL(e3 < e1, true) +END_SECTION + +START_SECTION(std::ostream& operator<<(std::ostream& os, const DigestionEnzyme& enzyme)) + DigestionEnzyme enzyme; + enzyme.setName("TestEnzyme"); + enzyme.setRegEx("[K]"); + enzyme.setRegExDescription("cuts at K"); + + stringstream ss; + ss << enzyme; + String output = ss.str(); + + TEST_EQUAL(output.hasSubstring("digestion enzyme:TestEnzyme"),true) + TEST_EQUAL(output.hasSubstring("(cleavage: [K] - cuts at K)"),true) +END_SECTION + +END_TEST diff --git a/src/tests/class_tests/openms/source/LinearResamplerAlign_test.cpp b/src/tests/class_tests/openms/source/LinearResamplerAlign_test.cpp index 9ea5cb3d7c0..7f6a419fbed 100644 --- a/src/tests/class_tests/openms/source/LinearResamplerAlign_test.cpp +++ b/src/tests/class_tests/openms/source/LinearResamplerAlign_test.cpp @@ -3,19 +3,20 @@ // // -------------------------------------------------------------------------- // $Maintainer: Hannes Roest $ -// $Authors: Hannes Roest $ +// $Authors: Hannes Roest, Luis Jacob Keller, Alen Saric$ // -------------------------------------------------------------------------- #include -#include -#include #include -#include #include +#include +#include #include +#include + using namespace OpenMS; using namespace std; @@ -55,6 +56,19 @@ input_spectrum[3].setIntensity(2.0f); input_spectrum[4].setMZ(1.8); input_spectrum[4].setIntensity(1.0f); +START_SECTION(LinearResamplerAlign()) +{ + LinearResamplerAlign lr; + TEST_NOT_EQUAL(&lr, static_cast(nullptr)); +} +END_SECTION + +START_SECTION((~LinearResamplerAlign())) +{ + LinearResamplerAlign lr; +} +END_SECTION + // A spacing of 0.75 will lead to a recalculation of intensities, each // resampled point gets intensities from raw data points that are at most +/- // spacing away. @@ -112,7 +126,46 @@ START_SECTION([EXTRA] test_linear_res_chromat) } END_SECTION -START_SECTION(( void raster(ConstPeakTypeIterator raw_it, ConstPeakTypeIterator raw_end, PeakTypeIterator resample_it, PeakTypeIterator resample_end))) +START_SECTION(( template void rasterExperiment(MSExperiment& exp))) +{ + MSSpectrum spec; + spec.resize(5); + spec[0].setMZ(0); + spec[0].setIntensity(3.0f); + spec[1].setMZ(0.5); + spec[1].setIntensity(6.0f); + spec[2].setMZ(1.); + spec[2].setIntensity(8.0f); + spec[3].setMZ(1.6); + spec[3].setIntensity(2.0f); + spec[4].setMZ(1.8); + spec[4].setIntensity(1.0f); + + PeakMap exp; + exp.addSpectrum(spec); + exp.addSpectrum(spec); + + LinearResamplerAlign lr; + Param param; + param.setValue("spacing",0.5); + lr.setParameters(param); + lr.rasterExperiment(exp); + + + for (Size s=0; s -#include - - -#include - -#include -#include - -/////////////////////////// - -START_TEST(LinearResampler, "$Id$") - -///////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////// - -using namespace OpenMS; - -LinearResampler* lr_ptr = nullptr; -LinearResampler* lr_nullPointer = nullptr; - -START_SECTION((LinearResampler())) - lr_ptr = new LinearResampler; - TEST_NOT_EQUAL(lr_ptr, lr_nullPointer); -END_SECTION - -START_SECTION((~LinearResampler())) - delete lr_ptr; -END_SECTION - -START_SECTION((template void raster(MSSpectrum& spectrum))) -{ - MSSpectrum spec; - spec.resize(5); - spec[0].setMZ(0); - spec[0].setIntensity(3.0f); - spec[1].setMZ(0.5); - spec[1].setIntensity(6.0f); - spec[2].setMZ(1.); - spec[2].setIntensity(8.0f); - spec[3].setMZ(1.6); - spec[3].setIntensity(2.0f); - spec[4].setMZ(1.8); - spec[4].setIntensity(1.0f); - - LinearResampler lr; - Param param; - param.setValue("spacing",0.5); - lr.setParameters(param); - lr.raster(spec); - - double sum = 0.0; - for (Size i=0; i> raw points (regression test for index bug) -{ - // 3 raw points spanning a wide range, resampled at fine spacing - // This triggers the bug where right_index was clamped to number_raw_points-1 - // instead of number_resampled_points-1 - MSSpectrum spec; - spec.resize(3); - spec[0].setMZ(0); - spec[0].setIntensity(10.0f); - spec[1].setMZ(5.0); - spec[1].setIntensity(20.0f); - spec[2].setMZ(10.0); - spec[2].setIntensity(10.0f); - - LinearResampler lr; - Param param; - param.setValue("spacing", 0.1); - lr.setParameters(param); - lr.raster(spec); - - // resampled grid: 101 points (ceil((10-0)/0.1 + 1)) - TEST_EQUAL(spec.size(), 101); - - // total intensity must be conserved - double sum = 0.0; - for (Size i = 0; i < spec.size(); ++i) - { - sum += spec[i].getIntensity(); - } - TEST_REAL_SIMILAR(sum, 40.0); - - // no resampled point should exceed the max raw intensity - for (Size i = 0; i < spec.size(); ++i) - { - TEST_EQUAL(spec[i].getIntensity() <= 20.0 + 1e-5, true); - } -} -END_SECTION - -START_SECTION(( template void rasterExperiment(MSExperiment& exp))) -{ - MSSpectrum spec; - spec.resize(5); - spec[0].setMZ(0); - spec[0].setIntensity(3.0f); - spec[1].setMZ(0.5); - spec[1].setIntensity(6.0f); - spec[2].setMZ(1.); - spec[2].setIntensity(8.0f); - spec[3].setMZ(1.6); - spec[3].setIntensity(2.0f); - spec[4].setMZ(1.8); - spec[4].setIntensity(1.0f); - - PeakMap exp; - exp.addSpectrum(spec); - exp.addSpectrum(spec); - - LinearResampler lr; - Param param; - param.setValue("spacing",0.5); - lr.setParameters(param); - lr.rasterExperiment(exp); - - - for (Size s=0; s 0.0, true) + TEST_EQUAL(cal.precursor_spread > 0.0, true) + // |shift| < spread is the precondition for the writeback block (extreme_bias + // already asserted false above, but state it as a positive numerical check). + TEST_EQUAL(std::abs(cal.precursor_shift) < cal.precursor_spread, true) + + // Load-bearing check — guards the sign convention in runCalibrationPass_'s + // writeback block. Under (lower, upper) where signed error e lies in + // [-upper, +lower], the calibrated window [shift - spread, shift + spread] + // maps to: + // cal_lower = spread + shift (upper endpoint of the signed window) + // cal_upper = spread - shift (|lower endpoint| of the signed window) + // A regression of the swap would flip both of these. + TEST_REAL_SIMILAR(cal.cal_lower, cal.precursor_spread + cal.precursor_shift) + TEST_REAL_SIMILAR(cal.cal_upper, cal.precursor_spread - cal.precursor_shift) + // Positive bias => cal_lower > cal_upper. A regression of the swap would + // produce cal_upper > cal_lower — the assertion below catches that even + // if the functional identities above are tautologically consistent with a + // mislabeled spread/shift pair. + TEST_EQUAL(cal.cal_lower > cal.cal_upper, true) + // Both tightened from user-configured (20, 30); std::min cap inactive, so + // the functional identities above are unconstrained. TEST_EQUAL(cal.cal_lower < 20.0, true) TEST_EQUAL(cal.cal_upper < 30.0, true) - // And remain asymmetric — the positive bias means cal_upper > cal_lower. - TEST_EQUAL(cal.cal_upper > cal.cal_lower, true) // Post-search, the tolerance members have been RESTORED to the user-configured // values to avoid per-file state leaks in the multi-file wrapper (which reuses a // single ProSEAlgorithm instance across files). The calibrated diff --git a/src/tests/topp/CMakeLists.txt b/src/tests/topp/CMakeLists.txt index 1532b4157d6..c7af5628dd8 100644 --- a/src/tests/topp/CMakeLists.txt +++ b/src/tests/topp/CMakeLists.txt @@ -3000,6 +3000,17 @@ if (WITH_OPENTIMS AND OPENTIMS_DDA_TEST_DATA) -protein false) set_tests_properties("TOPP_FalseDiscoveryRate_SimpleSearchEngineDDA" PROPERTIES DEPENDS "TOPP_SimpleSearchEngine_DDA") + # Floor-only PSM-count regression tripwire. Current yield on this fixture is + # 4187; floor set ~0.5% below so tiny tie-breaking jitter is absorbed but any + # real regression (silent native-ID or IM-annotation breakage, thirdparty + # algorithm drift) trips the check. If a legitimate improvement raises yield, + # raise the floor in the same commit. + add_test(NAME "TOPP_FalseDiscoveryRate_SimpleSearchEngineDDA_psm_count" + COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/check_psm_floor.cmake + ${TESTS_TEMP_DIR}/SimpleSearchEngine_DDA_fdr.tmp.idXML 4170) + set_tests_properties("TOPP_FalseDiscoveryRate_SimpleSearchEngineDDA_psm_count" + PROPERTIES DEPENDS "TOPP_FalseDiscoveryRate_SimpleSearchEngineDDA") + add_test("TOPP_ProSE_DDA" ${TOPP_BIN_PATH}/ProSE -test -in ${OPENTIMS_DDA_SYMLINK} -database ${OPENTIMS_TEST_FASTA} @@ -3021,6 +3032,13 @@ if (WITH_OPENTIMS AND OPENTIMS_DDA_TEST_DATA) -protein false) set_tests_properties("TOPP_FalseDiscoveryRate_ProSEDDA" PROPERTIES DEPENDS "TOPP_ProSE_DDA") + # Floor: 4373 current; guard ~0.5% below. + add_test(NAME "TOPP_FalseDiscoveryRate_ProSEDDA_psm_count" + COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/check_psm_floor.cmake + ${TESTS_TEMP_DIR}/ProSE_DDA_fdr.tmp.idXML 4350) + set_tests_properties("TOPP_FalseDiscoveryRate_ProSEDDA_psm_count" + PROPERTIES DEPENDS "TOPP_FalseDiscoveryRate_ProSEDDA") + # Same DDA search with automatic calibration enabled add_test("TOPP_ProSE_DDA_calibrated" ${TOPP_BIN_PATH}/ProSE -test -in ${OPENTIMS_DDA_SYMLINK} @@ -3044,6 +3062,16 @@ if (WITH_OPENTIMS AND OPENTIMS_DDA_TEST_DATA) -protein false) set_tests_properties("TOPP_FalseDiscoveryRate_ProSEDDA_calibrated" PROPERTIES DEPENDS "TOPP_ProSE_DDA_calibrated") + # Floor: 4537 current (calibration should legitimately EXCEED uncalibrated yield); + # guard ~0.5% below. This test also protects the sign-convention fix in + # ProSEAlgorithm::runCalibrationPass_ — a regression of the swap would drop + # this count back to ~1441. + add_test(NAME "TOPP_FalseDiscoveryRate_ProSEDDA_calibrated_psm_count" + COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/check_psm_floor.cmake + ${TESTS_TEMP_DIR}/ProSE_DDA_calibrated_fdr.tmp.idXML 4510) + set_tests_properties("TOPP_FalseDiscoveryRate_ProSEDDA_calibrated_psm_count" + PROPERTIES DEPENDS "TOPP_FalseDiscoveryRate_ProSEDDA_calibrated") + # Generate target-decoy database (shared by CometAdapter and SageAdapter DDA tests) add_test("TOPP_DecoyDatabase_DDA" ${TOPP_BIN_PATH}/DecoyDatabase -test -in ${OPENTIMS_TEST_FASTA} diff --git a/src/tests/topp/check_psm_floor.cmake b/src/tests/topp/check_psm_floor.cmake new file mode 100644 index 00000000000..25ec17a51d2 --- /dev/null +++ b/src/tests/topp/check_psm_floor.cmake @@ -0,0 +1,35 @@ +# Usage: cmake -P check_psm_floor.cmake +# +# Counts occurrences in an idXML file and fails if the count +# is below . Intended as a regression tripwire on PSM yields from the +# Bruker DDA opentims integration tests (1% FDR outputs). Floor-only: legitimate +# improvements that raise the count should NOT fail; only drops do. + +if(NOT DEFINED CMAKE_ARGV3 OR NOT DEFINED CMAKE_ARGV4) + message(FATAL_ERROR "Usage: cmake -P check_psm_floor.cmake ") +endif() + +set(file "${CMAKE_ARGV3}") +set(floor "${CMAKE_ARGV4}") + +if(NOT EXISTS "${file}") + message(FATAL_ERROR "PSM-count input not found: ${file}") +endif() + +# Use file(READ) + string(REGEX MATCHALL) rather than file(STRINGS ... REGEX). +# file(STRINGS) silently drops long lines/entries and produces wrong counts on +# idXML (observed: 186 vs the true 4537) — MATCHALL over the full file content +# is the reliable path. +file(READ "${file}" content) +string(REGEX MATCHALL " +#include + #ifdef WITH_OPENTIMS #include #endif @@ -697,13 +699,17 @@ class TOPPCometAdapter : const bool is_bruker_d = (FileHandler::getType(inputfile_name) == FileTypes::BRUKER_TDF); if (is_bruker_d) { - // Load .d via BrukerTimsFile + // Load .d via BrukerTimsFile. Skip MS1 loading for MS2 searches since + // Comet only searches MS2 — cuts load time substantially. auto bruker_config = getBrukerConfig_(); + if (ms_level == 2) + bruker_config.load_ms1 = false; BrukerTimsFile tims_file; tims_file.setLogType(log_type_); tims_file.load(inputfile_name, exp, bruker_config); - // Filter to target MS level only (Comet only needs MS2) + // Filter to target MS level only (Comet only needs MS2). Redundant when + // load_ms1=false for MS2 searches but kept for safety / MS1 searches. std::erase_if(exp.getSpectra(), [&](const MSSpectrum& s) { return s.getMSLevel() != static_cast(ms_level); }); OPENMS_LOG_INFO << "Loaded " << exp.size() << " MS" << ms_level << " spectra from Bruker .d directory." << std::endl; @@ -714,6 +720,27 @@ class TOPPCometAdapter : writeDebug_("Last native ID from .d: " + exp[exp.size() - 1].getNativeID(), 2); } + // Rewrite native IDs to "index=N" (monotonic) before the temp-mzML write. + // + // Bruker .d DDA native IDs are of the form "frame=F scan=S precursor=P". + // Comet's bundled mzParser (MSToolkit/saxmzmlhandler.cpp) greps for "scan=" + // anywhere in the id and uses atoi of the following digits as its sort + // key. Because `scan=S` here is the TIMS isolation-window start (not a + // monotonic counter), mzParser detects "unsorted" scans and falls into + // a std::sort with a strict-weak-ordering-violating comparator + // (cindex::compare returns true for equal values) → undefined behavior + // → segfault in std::string shuffling. + // + // Workaround: rewrite native IDs to `index=N` (counter path, no atoi + // on scan=) before handing the mzML to Comet. Preserve the original + // native ID on each spectrum so post-Comet PSMs can be translated back. + size_t idx = 0; + for (auto& spec : exp.getSpectra()) + { + spec.setMetaValue("original_native_id", spec.getNativeID()); + spec.setNativeID("index=" + String(idx++)); + } + // Write to temporary indexed mzML for Comet auto tmp_mzml = File::getTemporaryFile() + ".mzML"; MzMLFile().store(tmp_mzml, exp); @@ -820,9 +847,32 @@ class TOPPCometAdapter : // Parse FAIMS compensation voltage if present SpectrumMetaDataLookup::addMissingFAIMSToPeptideIDs(peptide_identifications, exp); +#ifdef WITH_OPENTIMS + // Translate PSM spectrum references back from "index=N" (the rewritten form + // we handed to Comet to work around the mzParser sort bug) to the original + // Bruker native ID ("frame=F scan=S precursor=P"). Only the Bruker branch + // set original_native_id; the mzML branch has no-op for this loop. + if (is_bruker_d && !exp.empty() && exp[0].metaValueExists("original_native_id")) + { + // Build rewritten-id → original-id map once (O(N) vs O(N*M) linear scan). + std::unordered_map id_map; + id_map.reserve(exp.size()); + for (const auto& spec : exp.getSpectra()) + { + if (spec.metaValueExists("original_native_id")) + id_map.emplace(spec.getNativeID(), spec.getMetaValue("original_native_id").toString()); + } + for (auto& pid : peptide_identifications) + { + auto it = id_map.find(pid.getSpectrumReference()); + if (it != id_map.end()) pid.setSpectrumReference(it->second); + } + } +#endif + // remove base_name meta value from peptide identifications for (auto& peptide_identification : peptide_identifications) - { + { peptide_identification.removeMetaValue("base_name"); } diff --git a/src/topp/FileConverter.cpp b/src/topp/FileConverter.cpp index b9d14ac77f3..bb49d77c3d8 100644 --- a/src/topp/FileConverter.cpp +++ b/src/topp/FileConverter.cpp @@ -184,6 +184,10 @@ class TOPPFileConverter : setMinFloat_("bruker:calibration_tolerance", 0.0); registerStringOption_("bruker:calibrate", "", "false", "Enable m/z recalibration (may fail on some datasets)", false, true); setValidStrings_("bruker:calibrate", {"true", "false"}); + registerStringOption_("bruker:load_ms1", "", "true", + "Load MS1 spectra. Disable for MS2-only workflows (peptide database search) " + "where MS1 surveys are not needed — substantially cuts memory and time. Affects all export modes.", false, true); + setValidStrings_("bruker:load_ms1", {"true", "false"}); registerStringOption_("bruker:export_mode", "", "auto", "Export mode: 'auto' detects DDA/DIA acquisition type, " "'spectrum' forces per-precursor MS2 spectra (DDA-style), 'frame' returns raw 4D frames without signal processing.", false, true); setValidStrings_("bruker:export_mode", {"auto", "spectrum", "frame"}); @@ -203,12 +207,42 @@ class TOPPFileConverter : setMinInt_("bruker:dia_ms2_n_neighbors", 0); registerIntOption_("bruker:dia_ms2_min_support", "", 1, "DIA MS2 denoising: minimum occupied neighbor cells in a 3x3 (m/z x IM) grid to keep a point " - "(center cell excluded from count). Applied after frame aggregation. Only effective when dia_ms2_n_neighbors > 0.", false, true); - setMinInt_("bruker:dia_ms2_min_support", 1); + "(center cell excluded from count). Applied after frame aggregation. Only effective when dia_ms2_n_neighbors > 0. " + "Set to 0 to disable denoising (useful for pure centroiding without noise filtering).", false, true); + setMinInt_("bruker:dia_ms2_min_support", 0); registerStringOption_("bruker:dia_ms2_centroid", "", "false", "Apply 2D Gaussian smoothing + local maxima peak picking to the denoised DIA MS2 grid. " "Produces IM_CENTROIDED spectra with sub-bin (m/z, IM) precision. Only effective when dia_ms2_n_neighbors > 0.", false, true); setValidStrings_("bruker:dia_ms2_centroid", {"true", "false"}); + + registerIntOption_("bruker:ms1_n_neighbors", "", 0, + "MS1 frame aggregation: number of adjacent MS1 frames on each side to sum. " + "0 = disabled (raw export), 1 = 3-frame sum, 2 = 5-frame sum. " + "Applies to both DIA and DDA; ignored in FRAME export mode.", false, true); + setMinInt_("bruker:ms1_n_neighbors", 0); + setMaxInt_("bruker:ms1_n_neighbors", 50); + + registerIntOption_("bruker:ms1_min_support", "", 0, + "MS1 denoising: minimum occupied neighbor cells in a 3x3 (m/z x IM) grid to keep a point. " + "Applied after aggregation. 0 = disabled, 8 = all 8 neighbors required (strictest). " + "Only effective when ms1_n_neighbors > 0. Appropriate for dense survey runs; disable for " + "rare-species discovery.", false, true); + setMinInt_("bruker:ms1_min_support", 0); + setMaxInt_("bruker:ms1_min_support", 8); + + registerDoubleOption_("bruker:ms1_max_rt_distance_sec", "", 0.0, + "Cap the RT distance (seconds) between a neighbor MS1 frame and the center frame during " + "aggregation. 0.0 = no cap. Recommended for DDA (e.g. 5.0) where MS1 frame cadence is " + "irregular. The center frame is always included regardless of this cap.", false, true); + setMinFloat_("bruker:ms1_max_rt_distance_sec", 0.0); + + registerIntOption_("bruker:ms1_centroid_max_peaks", "", 100000, + "Cap on the number of centroided peaks retained per MS1 spectrum. Top-intensity peaks " + "are kept; low-intensity tail is dropped if the limit is hit (a warning is logged in that " + "case). Only effective when MS1 centroiding is enabled via ms1_centroid_mz_ppm/pct. Raise " + "for aggregated MS1 (ms1_n_neighbors > 0) on dense surveys; lower to trim long-tail noise.", + false, true); + setMinInt_("bruker:ms1_centroid_max_peaks", 1); #endif registerTOPPSubsection_("RawToMzML", "Options for converting raw files to mzML (uses ThermoRawFileParser)"); @@ -236,6 +270,7 @@ class TOPPFileConverter : BrukerTimsFile::Config c; c.calibration_tolerance = getDoubleOption_("bruker:calibration_tolerance"); c.calibrate = (getStringOption_("bruker:calibrate") == "true"); + c.load_ms1 = (getStringOption_("bruker:load_ms1") == "true"); String mode = getStringOption_("bruker:export_mode"); if (mode == "spectrum") c.export_mode = BrukerTimsFile::Config::SPECTRUM; else if (mode == "frame") c.export_mode = BrukerTimsFile::Config::FRAME; @@ -245,6 +280,10 @@ class TOPPFileConverter : c.dia_ms2_n_neighbors = getIntOption_("bruker:dia_ms2_n_neighbors"); c.dia_ms2_min_support = getIntOption_("bruker:dia_ms2_min_support"); c.dia_ms2_centroid = (getStringOption_("bruker:dia_ms2_centroid") == "true"); + c.ms1_n_neighbors = getIntOption_("bruker:ms1_n_neighbors"); + c.ms1_min_support = getIntOption_("bruker:ms1_min_support"); + c.ms1_max_rt_distance_sec = getDoubleOption_("bruker:ms1_max_rt_distance_sec"); + c.ms1_centroid_max_peaks = getIntOption_("bruker:ms1_centroid_max_peaks"); return c; } #endif diff --git a/src/topp/ProSE.cpp b/src/topp/ProSE.cpp index bad739a0d09..d45db160f75 100644 --- a/src/topp/ProSE.cpp +++ b/src/topp/ProSE.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include // only for std::shared_ptr declarations @@ -102,6 +103,9 @@ class ProSE : registerOutputFile_("out_merged", "", "", "Optional merged output file containing all PSMs pooled across input files with cross-file protein inference (BasicProteinInferenceAlgorithm) and optional picked-protein FDR (FDR:protein). Per-file outputs (-out_idxml / -out_qpx / -out_parquet) retain run-level information. Only useful with multiple -in files.", false); setValidFormats_("out_merged", ListUtils::create("idXML")); + registerOutputFileList_("out_pin", "", StringList(), "Output Percolator input (.pin/.tsv) file(s) for external rescoring. Must have the same number of entries as -in. Written independently of -percolator_executable (i.e. you can produce .pin files without running Percolator).", false, true); + setValidFormats_("out_pin", ListUtils::create("tsv")); + registerOutputDir_("out_mod_analysis_dir", "

", "", "Optional directory to write modification-analysis tables (delta-mass, PTM stats) when running in open-search mode. When set, per-file tables are written using each input file's basename and an additional aggregate table is written across all input files. Has no effect in closed-search mode.", false, true); registerInputFile_("percolator_executable", "", @@ -123,6 +127,7 @@ class ProSE : const StringList in_list = getStringList_("in"); const String database = getStringOption_("database"); const StringList out_idxml_list = getStringList_("out_idxml"); + const StringList out_pin_list = getStringList_("out_pin"); const String out_merged = getStringOption_("out_merged"); const String out_qpx_dir = getOutputDirOption("out_qpx"); const String out_parquet_dir = getOutputDirOption("out_parquet"); @@ -136,9 +141,9 @@ class ProSE : } // At least one output must be specified - if (out_idxml_list.empty() && out_qpx_dir.empty() && out_parquet_dir.empty()) + if (out_idxml_list.empty() && out_pin_list.empty() && out_qpx_dir.empty() && out_parquet_dir.empty()) { - OPENMS_LOG_ERROR << "No output specified. Provide at least one of -out_idxml, -out_qpx, or -out_parquet." << endl; + OPENMS_LOG_ERROR << "No output specified. Provide at least one of -out_idxml, -out_pin, -out_qpx, or -out_parquet." << endl; return ILLEGAL_PARAMETERS; } @@ -150,6 +155,14 @@ class ProSE : return ILLEGAL_PARAMETERS; } + // -out_pin count must match -in count + if (!out_pin_list.empty() && in_list.size() != out_pin_list.size()) + { + OPENMS_LOG_ERROR << "Number of output files (-out_pin, " << out_pin_list.size() + << ") must match number of input files (-in, " << in_list.size() << ")." << endl; + return ILLEGAL_PARAMETERS; + } + // Validate basenames are unique (for parquet directory output) if (!out_qpx_dir.empty() || !out_parquet_dir.empty()) { @@ -435,6 +448,51 @@ class ProSE : } } + // -- Percolator .pin output -- + if (!out_pin_list.empty()) + { + try + { + if (result.protein_ids.empty()) + { + OPENMS_LOG_ERROR << "Cannot write .pin for " << in_file + << ": no ProteinIdentification/search parameters available." << endl; + input_failed = true; + } + else + { + const auto& sp = result.protein_ids.front().getSearchParameters(); + StringList feature_set; + if (sp.metaValueExists("extra_features")) + { + feature_set = ListUtils::create(sp.getMetaValue("extra_features").toString()); + } + if (std::find(feature_set.begin(), feature_set.end(), "score") == feature_set.end()) + { + feature_set.insert(feature_set.begin(), "score"); + } + feature_set.push_back("Peptide"); + feature_set.push_back("Proteins"); + const String enz_str = sp.digestion_enzyme.getName(); + const auto colon = sp.charges.find(':'); + const int min_charge = (colon != String::npos) + ? String(sp.charges.substr(0, colon)).toInt() + : String(sp.charges).toInt(); + const int max_charge = (colon != String::npos) + ? String(sp.charges.substr(colon + 1)).toInt() + : min_charge; + PercolatorInfile::store(out_pin_list[i], result.peptide_ids, + feature_set, enz_str, min_charge, max_charge); + } + } + catch (const Exception::BaseException& e) + { + OPENMS_LOG_ERROR << "Failed to write .pin output for " << in_file + << " -> " << out_pin_list[i] << ": " << e.what() << endl; + input_failed = true; + } + } + // -- QPX directory output -- if (!out_qpx_dir.empty()) { diff --git a/src/topp/Resampler.cpp b/src/topp/Resampler.cpp index d046a1a9e50..358dd21c38e 100644 --- a/src/topp/Resampler.cpp +++ b/src/topp/Resampler.cpp @@ -119,7 +119,7 @@ class TOPPResampler : resampler_param.setValue("spacing", sampling_rate); resampler_param.setValue("ppm", ppm ? "true" : "false"); - LinearResamplerAlign lin_resampler; // LinearResampler does not know about ppm! + LinearResamplerAlign lin_resampler; lin_resampler.setParameters(resampler_param); if (!align_sampling) {