From 08e8ffd7cc5d01a58985a0b0931276feae03798a Mon Sep 17 00:00:00 2001 From: guillaume-osmo Date: Wed, 10 Jun 2026 10:53:38 +0200 Subject: [PATCH] osmordred: add smarts291 Abraham SMARTS feature set (291 features) Adds the SMARTS291 descriptor set: 291 SMARTS-based features for Abraham solvation-parameter prediction (A/S/B/E/L/V models) computed in C++, exposed via CalcAbrahamFeatures, ExtractSMARTS291Batch, ExtractSMARTS291FromMolsBatch, and HasSMARTS291Support. - Code/GraphMol/Descriptors/smarts291/{SMARTS291.h,abraham_integration.cpp, abraham_queries.cpp,abraham_queries.h,test_smarts291.cpp} - CMake wiring + Python wrapper bindings under RDK_BUILD_OSMORDRED. - Distinct symbols from osmordred's calcAbrahams (no collision); CalcAbrahams binding already present in the base, so not re-added. Independent of the osmordredv3 PR and the rdkit217 PR; branches off osmordred. Assisted by Claude --- Code/GraphMol/Descriptors/CMakeLists.txt | 8 + .../Descriptors/Wrap/rdMolDescriptors.cpp | 60 ++ .../Descriptors/smarts291/SMARTS291.h | 92 ++++ .../smarts291/abraham_integration.cpp | 243 +++++++++ .../Descriptors/smarts291/abraham_queries.cpp | 512 ++++++++++++++++++ .../Descriptors/smarts291/abraham_queries.h | 21 + .../Descriptors/smarts291/test_smarts291.cpp | 250 +++++++++ 7 files changed, 1186 insertions(+) create mode 100644 Code/GraphMol/Descriptors/smarts291/SMARTS291.h create mode 100644 Code/GraphMol/Descriptors/smarts291/abraham_integration.cpp create mode 100644 Code/GraphMol/Descriptors/smarts291/abraham_queries.cpp create mode 100644 Code/GraphMol/Descriptors/smarts291/abraham_queries.h create mode 100644 Code/GraphMol/Descriptors/smarts291/test_smarts291.cpp diff --git a/Code/GraphMol/Descriptors/CMakeLists.txt b/Code/GraphMol/Descriptors/CMakeLists.txt index 30dada25d16..c646ad020de 100644 --- a/Code/GraphMol/Descriptors/CMakeLists.txt +++ b/Code/GraphMol/Descriptors/CMakeLists.txt @@ -22,6 +22,11 @@ if(RDK_BUILD_OSMORDRED) 
OsmordredTopologicalConnectivityShape.cpp OsmordredMatrixAutocorrEStateFragments.cpp) + # smarts291: Abraham SMARTS-based features (291 features) + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/smarts291) + set(SMARTS291_HDRS smarts291/SMARTS291.h smarts291/abraham_queries.h) + set(SMARTS291_SOURCES smarts291/abraham_integration.cpp smarts291/abraham_queries.cpp) + include_directories(Descriptors LAPACK::LAPACK ${LAPACK_INCLUDE_DIRS}) endif(RDK_BUILD_OSMORDRED) @@ -35,6 +40,7 @@ rdkit_library(Descriptors OxidationNumbers.cpp DCLV.cpp ${OSMORDRED_SOURCES} + ${SMARTS291_SOURCES} ${DESC3D_SOURCES} LINK_LIBRARIES DataStructs Fingerprints PartialCharges SmilesParse FileParsers Subgraphs SubstructMatch MolTransforms GraphMol EigenSolvers RDGeneral) @@ -51,6 +57,7 @@ rdkit_headers(Crippen.h BCUT.h Lipinski.h OxidationNumbers.h DCLV.h ${OSMORDRED_HDRS} + ${SMARTS291_HDRS} ${DESC3D_HDRS} DEST GraphMol/Descriptors) @@ -95,6 +102,7 @@ rdkit_catch_test(descriptorsTestCatch catch_tests.cpp LINK_LIBRARIES Descriptors if(RDK_BUILD_OSMORDRED) rdkit_catch_test(testOsmordred test_osmordred.cpp LINK_LIBRARIES Descriptors SmilesParse FileParsers) + rdkit_catch_test(testSMARTS291 smarts291/test_smarts291.cpp LINK_LIBRARIES Descriptors SmilesParse FileParsers) endif(RDK_BUILD_OSMORDRED) if(RDK_BUILD_PYTHON_WRAPPERS) diff --git a/Code/GraphMol/Descriptors/Wrap/rdMolDescriptors.cpp b/Code/GraphMol/Descriptors/Wrap/rdMolDescriptors.cpp index 05f6b23f541..cfaabf2b906 100644 --- a/Code/GraphMol/Descriptors/Wrap/rdMolDescriptors.cpp +++ b/Code/GraphMol/Descriptors/Wrap/rdMolDescriptors.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -2226,5 +2227,64 @@ BOOST_PYTHON_MODULE(rdMolDescriptors) { python::def("HasOsmordredSupport", hasOsmordredSupport, "Returns True if the RDKit is compiled with osmordred support, False otherwise.\n" "If false, all osmordred functions return zero or empty vectors."); + + // 
========================================================================= + // SMARTS291: Abraham SMARTS-based features (291 features) + // ========================================================================= + python::def("HasSMARTS291Support", RDKit::Descriptors::SMARTS291::hasSMARTS291Support, + "Check if SMARTS291 support is available.\n" + "Returns: True if SMARTS291 features can be computed.\n"); + + python::def("CalcAbrahamFeatures", RDKit::Descriptors::Osmordred::calcAbrahamsFeatures, + "Calculate 291 Abraham SMARTS-based features for molecular property prediction.\n" + "Returns: vector of 291 double values (241 base SMARTS + 50 golden ratio features)\n" + "These features are used for physicochemical property prediction (V, E, L, B, S, A).\n"); + + // Batch wrapper for SMARTS291 with multi-threading (accepts Python list of SMILES strings) + auto smarts291_batch_impl = +[](python::list smiles_py, char param, int n_jobs) { + std::vector smiles_list; + smiles_list.reserve(python::len(smiles_py)); + for (int i = 0; i < python::len(smiles_py); ++i) { + python::object obj = smiles_py[i]; + if (obj.is_none()) { + smiles_list.push_back(""); // Empty string for invalid SMILES + } else { + smiles_list.push_back(python::extract(obj)); + } + } + return RDKit::Descriptors::SMARTS291::extractSMARTS291Batch(smiles_list, param, n_jobs); + }; + python::def("ExtractSMARTS291Batch", smarts291_batch_impl, + (python::arg("smiles_list"), python::arg("param")='A', python::arg("n_jobs")=0), + "Extract 291 SMARTS-based Abraham features from SMILES in parallel.\n" + "Input: list of SMILES strings, param (model type: 'A' default), n_jobs (0=auto)\n" + "Output: list of 291-feature vectors (241 base + 50 golden features)\n" + "Uses parallel processing when n_jobs > 0 (0 = auto-detect CPU count).\n"); + + // Batch wrapper for SMARTS291 from Mol objects (accepts Python list of Mol objects) + auto smarts291_from_mols_impl = +[](python::list mols_py, char param, int n_jobs) { + 
std::vector mols; + mols.reserve(python::len(mols_py)); + for (int i = 0; i < python::len(mols_py); ++i) { + python::object obj = mols_py[i]; + if (obj.is_none()) { + mols.push_back(nullptr); + } else { + mols.push_back(python::extract(obj)); + } + } + return RDKit::Descriptors::SMARTS291::extractSMARTS291FromMolsBatch(mols, param, n_jobs); + }; + python::def("ExtractSMARTS291FromMolsBatch", smarts291_from_mols_impl, + (python::arg("mols"), python::arg("param")='A', python::arg("n_jobs")=0), + "Extract 291 SMARTS-based Abraham features from Mol objects in parallel.\n" + "Input: list of RDKit Mol objects, param (model type: 'A' default), n_jobs (0=auto)\n" + "Output: list of 291-feature vectors (NaN for invalid molecules).\n"); + + python::def("GetSMARTS291FeatureNames", RDKit::Descriptors::SMARTS291::getSMARTS291FeatureNames, + (python::arg("param")='A'), + "Get the 291 SMARTS feature names (241 base + 50 golden for specified model).\n" + "Returns: vector of 291 strings with feature names.\n"); + #endif // osmordred } diff --git a/Code/GraphMol/Descriptors/smarts291/SMARTS291.h b/Code/GraphMol/Descriptors/smarts291/SMARTS291.h new file mode 100644 index 00000000000..390aea6b622 --- /dev/null +++ b/Code/GraphMol/Descriptors/smarts291/SMARTS291.h @@ -0,0 +1,92 @@ +// Copyright (c) 2025, Guillaume Godin Osmo Labs, PBC's and others +// All rights reserved. +// +// SMARTS291 - Abraham SMARTS-based Features +// +// This module provides 291 SMARTS-based features for molecular property prediction. +// The features consist of: +// - 241 base features: SMARTS pattern counts (sorted alphabetically) +// - 50 golden features: Ratio features derived from base features +// +// These features are designed for Abraham parameter prediction (A, B, E, L, S, V) +// and are used as input to machine learning models for physicochemical property prediction. 
+ +#ifndef SMARTS291_H +#define SMARTS291_H + +#include +#include +#include +#include + +namespace RDKit { +namespace Descriptors { +namespace SMARTS291 { + +// Check if SMARTS291 support is available +RDKIT_DESCRIPTORS_EXPORT bool hasSMARTS291Support(); + +// Extract 241 base SMARTS features +// These are SMARTS pattern match counts, sorted alphabetically by feature name +// Returns: vector of 241 double values (count of matches for each SMARTS pattern) +RDKIT_DESCRIPTORS_EXPORT std::vector extractBaseFeatures(const RDKit::ROMol& mol); + +// Generate 50 golden features from base features +// These are ratio features: baseFeatures[i] / baseFeatures[j] +// Different Abraham parameters (A, B, E, L, S, V) use different golden feature definitions +// Returns: vector of 50 double values (0.0 if denominator is 0) +RDKIT_DESCRIPTORS_EXPORT std::vector generateGoldenFeaturesA(const std::vector& baseFeatures); +RDKIT_DESCRIPTORS_EXPORT std::vector generateGoldenFeaturesS(const std::vector& baseFeatures); +RDKIT_DESCRIPTORS_EXPORT std::vector generateGoldenFeaturesB(const std::vector& baseFeatures); +RDKIT_DESCRIPTORS_EXPORT std::vector generateGoldenFeaturesE(const std::vector& baseFeatures); +RDKIT_DESCRIPTORS_EXPORT std::vector generateGoldenFeaturesL(const std::vector& baseFeatures); +RDKIT_DESCRIPTORS_EXPORT std::vector generateGoldenFeaturesV(const std::vector& baseFeatures); + +// Extract all 291 SMARTS features for a given Abraham parameter +// Returns: vector of 291 double values (241 base + 50 golden) +RDKIT_DESCRIPTORS_EXPORT std::vector extractSMARTS291_A(const RDKit::ROMol& mol); +RDKIT_DESCRIPTORS_EXPORT std::vector extractSMARTS291_S(const RDKit::ROMol& mol); +RDKIT_DESCRIPTORS_EXPORT std::vector extractSMARTS291_B(const RDKit::ROMol& mol); +RDKIT_DESCRIPTORS_EXPORT std::vector extractSMARTS291_E(const RDKit::ROMol& mol); +RDKIT_DESCRIPTORS_EXPORT std::vector extractSMARTS291_L(const RDKit::ROMol& mol); +RDKIT_DESCRIPTORS_EXPORT std::vector 
extractSMARTS291_V(const RDKit::ROMol& mol); + +// Batch extraction from SMILES list +RDKIT_DESCRIPTORS_EXPORT std::vector> extractSMARTS291Batch( + const std::vector& smiles_list, char param = 'A', int n_jobs = 0); + +// Batch extraction from Mol objects +RDKIT_DESCRIPTORS_EXPORT std::vector> extractSMARTS291FromMolsBatch( + const std::vector& mols, char param = 'A', int n_jobs = 0); + +// Get feature names +RDKIT_DESCRIPTORS_EXPORT std::vector getBaseFeatureNames(); +RDKIT_DESCRIPTORS_EXPORT std::vector getGoldenFeatureNames(char param = 'A'); +RDKIT_DESCRIPTORS_EXPORT std::vector getSMARTS291FeatureNames(char param = 'A'); + +} // namespace SMARTS291 + +// Legacy Osmordred namespace for compatibility +namespace Osmordred { + +// Extract 241 base features using SMARTS patterns +RDKIT_DESCRIPTORS_EXPORT std::vector extractAbrahamBaseFeatures(const RDKit::ROMol& mol); + +// Generate 50 golden features (ratio features) +RDKIT_DESCRIPTORS_EXPORT std::vector generateGoldenFeaturesA(const std::vector& baseFeatures); +RDKIT_DESCRIPTORS_EXPORT std::vector generateGoldenFeaturesS(const std::vector& baseFeatures); +RDKIT_DESCRIPTORS_EXPORT std::vector generateGoldenFeaturesRidge(const std::vector& baseFeatures); + +// Calculate 291 Abraham features (241 base + 50 golden for A model) +RDKIT_DESCRIPTORS_EXPORT std::vector calcAbrahamsFeatures(const RDKit::ROMol& mol); + +#ifdef HAVE_ABRAHAM_MODELS +// Full Abraham parameter prediction (requires trained models) +RDKIT_DESCRIPTORS_EXPORT std::vector calcAbrahams(const RDKit::ROMol& mol); +#endif + +} // namespace Osmordred +} // namespace Descriptors +} // namespace RDKit + +#endif // SMARTS291_H diff --git a/Code/GraphMol/Descriptors/smarts291/abraham_integration.cpp b/Code/GraphMol/Descriptors/smarts291/abraham_integration.cpp new file mode 100644 index 00000000000..c44afea7a7e --- /dev/null +++ b/Code/GraphMol/Descriptors/smarts291/abraham_integration.cpp @@ -0,0 +1,243 @@ +// CLEAN Abraham Integration - Uses new query 
system +// Generated to match trained model EXACTLY +// EACH MODEL TYPE USES ITS OWN GOLDEN FEATURES! + +#include "SMARTS291.h" +#include "abraham_queries.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace RDKit { +namespace Descriptors { +namespace Osmordred { + +std::vector extractAbrahamBaseFeatures(const RDKit::ROMol& mol) { + std::vector features; + features.reserve(241); + + auto queries = GetQueriesAbrahamBaseFeatures(); + size_t max_queries = (queries.size() > 241) ? 241 : queries.size(); + + RDKit::RWMol mol_rw(mol); + + for (size_t i = 0; i < max_queries; ++i) { + if (queries[i]) { + std::vector matches; + RDKit::SubstructMatch(mol_rw, *queries[i], matches, true); + features.push_back(static_cast(matches.size())); + } else { + features.push_back(0.0); + } + } + + while (features.size() < 241) { + features.push_back(0.0); + } + + return features; +} + +std::vector generateGoldenFeaturesA(const std::vector& baseFeatures) { + std::vector goldenFeatures; + goldenFeatures.reserve(50); + + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[89] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[29] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[91] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[108] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[29] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[39] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[54] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[0] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? 
baseFeatures[54] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[34] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[18] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[90] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[90] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[18] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[89] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[55] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[108] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[2] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[39] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[91] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[51] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[9] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[107] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[51] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[13] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[93] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[9] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[55] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? 
baseFeatures[93] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[11] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[52] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[92] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[92] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[53] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[2] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[53] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[52] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[56] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[34] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[56] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[59] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[85] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[85] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[38] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[110] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[110] / baseFeatures[127] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[60] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? baseFeatures[59] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[113] != 0.0 ? 
baseFeatures[3] / baseFeatures[113] : 0.0); + goldenFeatures.push_back(baseFeatures[127] != 0.0 ? baseFeatures[84] / baseFeatures[127] : 0.0); + + return goldenFeatures; +} + +std::vector calcAbrahamsFeatures(const RDKit::ROMol& mol) { + std::vector baseFeatures = extractAbrahamBaseFeatures(mol); + std::vector goldenA = generateGoldenFeaturesA(baseFeatures); + + std::vector allFeatures = baseFeatures; + allFeatures.insert(allFeatures.end(), goldenA.begin(), goldenA.end()); + + return allFeatures; +} + +} // namespace Osmordred + +namespace SMARTS291 { + +bool hasSMARTS291Support() { + return true; +} + +std::vector getBaseFeatureNames() { + std::vector names; + names.reserve(241); + for (int i = 0; i < 241; ++i) { + names.push_back("base_" + std::to_string(i)); + } + return names; +} + +std::vector getGoldenFeatureNames(char param) { + std::vector names; + names.reserve(50); + for (int i = 0; i < 50; ++i) { + names.push_back(std::string("golden_") + param + "_" + std::to_string(i)); + } + return names; +} + +std::vector getSMARTS291FeatureNames(char param) { + std::vector names = getBaseFeatureNames(); + std::vector golden = getGoldenFeatureNames(param); + names.insert(names.end(), golden.begin(), golden.end()); + return names; +} + +std::vector> extractSMARTS291Batch( + const std::vector& smiles_list, char /*param*/, int n_jobs) { + + std::vector> results; + results.reserve(smiles_list.size()); + + unsigned int nThreads = getNumThreadsToUse(n_jobs); + + if (nThreads <= 1 || smiles_list.size() < 10) { + for (const auto& smi : smiles_list) { + ROMol* mol = SmilesToMol(smi); + if (mol) { + results.push_back(Osmordred::calcAbrahamsFeatures(*mol)); + delete mol; + } else { + results.push_back(std::vector(291, 0.0)); + } + } + return results; + } + + std::vector>> futures; + futures.reserve(smiles_list.size()); + + for (const auto& smi : smiles_list) { + futures.emplace_back(std::async(std::launch::async, [smi]() { + try { + ROMol* mol = SmilesToMol(smi); + if (mol) { + 
std::vector feats = Osmordred::calcAbrahamsFeatures(*mol); + delete mol; + return feats; + } + } catch (...) {} + return std::vector(291, 0.0); + })); + } + + for (auto& f : futures) { + results.push_back(f.get()); + } + + return results; +} + +std::vector> extractSMARTS291FromMolsBatch( + const std::vector& mols, char /*param*/, int n_jobs) { + + const double kNaN = std::numeric_limits::quiet_NaN(); + const std::vector nanRow(291, kNaN); + + std::vector> results; + results.reserve(mols.size()); + + unsigned int nThreads = getNumThreadsToUse(n_jobs); + + if (nThreads <= 1 || mols.size() < 10) { + for (const auto* mol : mols) { + if (mol) { + try { + results.push_back(Osmordred::calcAbrahamsFeatures(*mol)); + } catch (...) { + results.push_back(nanRow); + } + } else { + results.push_back(nanRow); + } + } + return results; + } + + std::vector>> futures; + futures.reserve(mols.size()); + + for (const auto* mol : mols) { + futures.emplace_back(std::async(std::launch::async, [mol, &nanRow]() { + if (mol) { + try { + return Osmordred::calcAbrahamsFeatures(*mol); + } catch (...) 
{} + } + return nanRow; + })); + } + + for (auto& f : futures) { + results.push_back(f.get()); + } + + return results; +} + +} // namespace SMARTS291 +} // namespace Descriptors +} // namespace RDKit diff --git a/Code/GraphMol/Descriptors/smarts291/abraham_queries.cpp b/Code/GraphMol/Descriptors/smarts291/abraham_queries.cpp new file mode 100644 index 00000000000..b6bf40be4f2 --- /dev/null +++ b/Code/GraphMol/Descriptors/smarts291/abraham_queries.cpp @@ -0,0 +1,512 @@ +// Auto-generated query functions for Abraham features +// These queries are in the EXACT order the trained model expects + +#include +#include +#include +#include +#include + +namespace RDKit { +namespace Descriptors { +namespace Osmordred { + +// Get 241 base feature queries in EXACT model order +// Thread-safe one-time initialization: the function-local static vector is filled inside std::call_once +const std::vector>& GetQueriesAbrahamBaseFeatures() { + static std::vector> queries; + static std::once_flag init_flag; + + std::call_once(init_flag, []() { + queries.reserve(241); + + // [0] abraham_A_additional_0 + queries.push_back(std::shared_ptr(SmartsToMol("[OH][CX4][CX4][OH]"))); + // [1] abraham_A_additional_1 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]c1ccccc1[OH]"))); + // [2] abraham_A_additional_10 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]c1ccccc1[NX3](=O)=O"))); + // [3] abraham_A_additional_11 + queries.push_back(std::shared_ptr(SmartsToMol("c1ccccc1[OH]c1ccccc1"))); + // [4] abraham_A_additional_12 + queries.push_back(std::shared_ptr(SmartsToMol("[OH][CX3]=[CX3]"))); + // [5] abraham_A_additional_13 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[OH][CX4][OH]"))); + // [6] abraham_A_additional_14 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[OH][NX3]"))); + // [7] abraham_A_additional_2 +
queries.push_back(std::shared_ptr(SmartsToMol("[NX3;H2][CX4][OH]"))); + // [9] abraham_A_additional_4 + queries.push_back(std::shared_ptr(SmartsToMol("[NX3;H1][CX4][OH]"))); + // [10] abraham_A_additional_5 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[OH]c1ccccc1"))); + // [11] abraham_A_additional_6 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[OH]c1ccccc1[OH]"))); + // [12] abraham_A_additional_7 + queries.push_back(std::shared_ptr(SmartsToMol("[OH][CX4][CX3](=O)"))); + // [13] abraham_A_additional_8 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]c1ccccc1[CX3](=O)"))); + // [14] abraham_A_additional_9 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]c1ccccc1[F,Cl,Br,I]"))); + // [15] abraham_A_frag_0 + queries.push_back(std::shared_ptr(SmartsToMol("[C][OX2H]"))); + // [16] abraham_A_frag_1 + queries.push_back(std::shared_ptr(SmartsToMol("[c][OX2H]"))); + // [17] abraham_A_frag_10 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=[OX1])[NX3;H1][C]"))); + // [18] abraham_A_frag_11 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=[OX1])[NX3;H1][c]"))); + // [19] abraham_A_frag_12 + queries.push_back(std::shared_ptr(SmartsToMol("[$([SX4](=[OX1])(=[OX1])([!O])[NH,NH2,NH3+]),$([SX4+2]([OX1-])([OX1-])([!O])[NH,NH2,NH3+])]"))); + // [20] abraham_A_frag_13 + queries.push_back(std::shared_ptr(SmartsToMol("[NX3;H1]C(=[OX1])[NX3;H1]"))); + // [21] abraham_A_frag_14 + queries.push_back(std::shared_ptr(SmartsToMol("[NX3;H0]C(=[OX1])[NX3;H1]"))); + // [22] abraham_A_frag_15 + queries.push_back(std::shared_ptr(SmartsToMol("[NX3;H1]C(=[OX1])O"))); + // [23] abraham_A_frag_16 + queries.push_back(std::shared_ptr(SmartsToMol("[NX3;H1]C(=N)[NX3;H0]"))); + // [24] abraham_A_frag_17 + queries.push_back(std::shared_ptr(SmartsToMol("[C]#[CH]"))); + // [25] abraham_A_frag_18 + queries.push_back(std::shared_ptr(SmartsToMol("P[OH,O-]"))); + // [26] abraham_A_frag_19 + 
queries.push_back(std::shared_ptr(SmartsToMol("[CH][F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)]"))); + // [27] abraham_A_frag_2 + queries.push_back(std::shared_ptr(SmartsToMol("[C][NX3;H2]"))); + // [28] abraham_A_frag_20 + queries.push_back(std::shared_ptr(SmartsToMol("[CH]([F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)])[F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)]"))); + // [29] abraham_A_frag_21 + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([CX3](=O)[OX1H0-,OX2H1])[CX4][CX3](=O)[OX1H0-,OX2H1]"))); + // [30] abraham_A_frag_22 + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)])[CX3](=O)[OX1H0-,OX2H1]"))); + // [31] abraham_A_frag_23 + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)])[OH]"))); + // [32] abraham_A_frag_24 + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)])[CX4][OH]"))); + // [33] abraham_A_frag_25 + queries.push_back(std::shared_ptr(SmartsToMol("[nX3;H1]:n"))); + // [34] abraham_A_frag_26 + queries.push_back(std::shared_ptr(SmartsToMol("[nX3;H1]:c:n"))); + // [35] abraham_A_frag_27 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2;H1]CC[O,N]"))); + // [36] abraham_A_frag_28 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2;H1]C[C,N]=[O,S]"))); + // [37] abraham_A_frag_29 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2;H1]c1ccccc1[O,NX3]"))); + // [38] abraham_A_frag_3 + queries.push_back(std::shared_ptr(SmartsToMol("[c][NX3;H2;!$(NC=O)]"))); + // [39] abraham_A_frag_30 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2;H1]c1ccccc1C=[O,S]"))); + // [40] abraham_A_frag_31 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2;H1]c1ccccc1[$([NX3](=O)=O),$([NX3+](=O)[O-])]"))); + // [41] abraham_A_frag_32 + 
queries.push_back(std::shared_ptr(SmartsToMol("[NH,NH2,NH3+]CC[O,N]"))); + // [42] abraham_A_frag_33 + queries.push_back(std::shared_ptr(SmartsToMol("[NH,NH2,NH3+]c1ccccc1[O,N]"))); + // [43] abraham_A_frag_34 + queries.push_back(std::shared_ptr(SmartsToMol("[NH,NH2,NH3+]c1ccccc1[C,N]=[O,S]"))); + // [44] abraham_A_frag_35 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2H]c1ccccc1[Cl,Br,I]"))); + // [45] abraham_A_frag_36 + queries.push_back(std::shared_ptr(SmartsToMol("[OX1]=[C,c]~[C,c]C[OH]"))); + // [46] abraham_A_frag_37 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]c1cccc2cccnc12"))); + // [47] abraham_A_frag_38 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]c1cc([F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)])ccc1"))); + // [48] abraham_A_frag_39 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]c1ccc([F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)])cc1"))); + // [49] abraham_A_frag_4 + queries.push_back(std::shared_ptr(SmartsToMol("[C][NX3;H1;!R][C]"))); + // [50] abraham_A_frag_40 + queries.push_back(std::shared_ptr(SmartsToMol("[NH,NH2,NH3+]c1cc([F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)])ccc1"))); + // [51] abraham_A_frag_41 + queries.push_back(std::shared_ptr(SmartsToMol("[NH,NH2,NH3+]c1ccc([F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)])cc1"))); + // [52] abraham_A_frag_42 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)([OX1H0-,OX2H1])c1cc([F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)])ccc1"))); + // [53] abraham_A_frag_43 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)([OX1H0-,OX2H1])c1ccc([F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)])cc1"))); + // [54] abraham_A_frag_44 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]c1c([CX4])cccc1[CX4]"))); + // [55] abraham_A_frag_45 + queries.push_back(std::shared_ptr(SmartsToMol("[NH,NH2,NH3+]c1c([CX4])cccc1[CX4]"))); + // [56] 
abraham_A_frag_46 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]c1c(C[F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)])cccc1"))); + // [57] abraham_A_frag_47 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]c1cc([CX3](=O)[OX1H0-,OX2H1])ccc1"))); + // [58] abraham_A_frag_48 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]c1ccc([CX3](=O)[OX1H0-,OX2H1])cc1"))); + // [59] abraham_A_frag_49 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]c1cc([$([CH](=O)),$(C(=O)C)])ccc1"))); + // [60] abraham_A_frag_5 + queries.push_back(std::shared_ptr(SmartsToMol("[C][NX3;H1;R][C]"))); + // [61] abraham_A_frag_50 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]c1ccc([$([CH](=O)),$(C(=O)C)])cc1"))); + // [62] abraham_A_frag_6 + queries.push_back(std::shared_ptr(SmartsToMol("[c][NX3;H1;!$(NC=O)][C]"))); + // [63] abraham_A_frag_7 + queries.push_back(std::shared_ptr(SmartsToMol("[c][nX3;H1][c]"))); + // [64] abraham_A_frag_8 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[OX1H0-,OX2H1]"))); + // [65] abraham_A_frag_9 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=[OX1])[NX3;H2]"))); + // [66] abraham_BSEL_frag_0 + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H3]"))); + // [67] abraham_BSEL_frag_1 + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H2]"))); + // [68] abraham_BSEL_frag_10 + queries.push_back(std::shared_ptr(SmartsToMol("[C][NX3;H1][C]"))); + // [69] abraham_BSEL_frag_11 + queries.push_back(std::shared_ptr(SmartsToMol("[c][NX3;H1]"))); + // [70] abraham_BSEL_frag_12 + queries.push_back(std::shared_ptr(SmartsToMol("[C][NX3;H0](C)[C]"))); + // [71] abraham_BSEL_frag_13 + queries.push_back(std::shared_ptr(SmartsToMol("[c][NX3;H0](C)[C]"))); + // [72] abraham_BSEL_frag_14 + queries.push_back(std::shared_ptr(SmartsToMol("[c][nX3;H0][c]"))); + // [73] abraham_BSEL_frag_15 + queries.push_back(std::shared_ptr(SmartsToMol("*=[Nv3;!R]"))); + // [74] abraham_BSEL_frag_16 + 
queries.push_back(std::shared_ptr(SmartsToMol("*=[Nv3;R]"))); + // [75] abraham_BSEL_frag_17 + queries.push_back(std::shared_ptr(SmartsToMol("[nX2H0,nX3H1+](a)a"))); + // [76] abraham_BSEL_frag_18 + queries.push_back(std::shared_ptr(SmartsToMol("N#C[A;!#1]"))); + // [77] abraham_BSEL_frag_19 + queries.push_back(std::shared_ptr(SmartsToMol("N#C[a;!#1]"))); + // [78] abraham_BSEL_frag_2 + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H1]"))); + // [79] abraham_BSEL_frag_20 + queries.push_back(std::shared_ptr(SmartsToMol("[$([A;!#1][NX3](=O)=O),$([A;!#1][NX3+](=O)[O-])]"))); + // [80] abraham_BSEL_frag_21 + queries.push_back(std::shared_ptr(SmartsToMol("[$([a;!#1][NX3](=O)=O),$([a;!#1][NX3+](=O)[O-])]"))); + // [81] abraham_BSEL_frag_22 + queries.push_back(std::shared_ptr(SmartsToMol("[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]"))); + // [82] abraham_BSEL_frag_23 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]"))); + // [83] abraham_BSEL_frag_24 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2;H0;!R]"))); + // [84] abraham_BSEL_frag_25 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2;H0;R]"))); + // [85] abraham_BSEL_frag_26 + queries.push_back(std::shared_ptr(SmartsToMol("[oX2](a)a"))); + // [86] abraham_BSEL_frag_27 + queries.push_back(std::shared_ptr(SmartsToMol("*=O"))); + // [87] abraham_BSEL_frag_28 + queries.push_back(std::shared_ptr(SmartsToMol("[SX2](*)*"))); + // [88] abraham_BSEL_frag_29 + queries.push_back(std::shared_ptr(SmartsToMol("[sX2](a)a"))); + // [89] abraham_BSEL_frag_3 + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H0]"))); + // [90] abraham_BSEL_frag_30 + queries.push_back(std::shared_ptr(SmartsToMol("*=[SX1]"))); + // [91] abraham_BSEL_frag_31 + queries.push_back(std::shared_ptr(SmartsToMol("[SX3]"))); + // [92] abraham_BSEL_frag_32 + queries.push_back(std::shared_ptr(SmartsToMol("[$([#16X4](=[OX1])(=[OX1])([!#8])[OX2H0]),$([#16X4+2]([OX1-])([OX1-])([!#8])[OX2H0])]"))); + // [93] abraham_BSEL_frag_33 + 
queries.push_back(std::shared_ptr(SmartsToMol("[S,s]"))); + // [94] abraham_BSEL_frag_34 + queries.push_back(std::shared_ptr(SmartsToMol("[P,p]"))); + // [95] abraham_BSEL_frag_35 + queries.push_back(std::shared_ptr(SmartsToMol("FA"))); + // [96] abraham_BSEL_frag_36 + queries.push_back(std::shared_ptr(SmartsToMol("Fa"))); + // [97] abraham_BSEL_frag_37 + queries.push_back(std::shared_ptr(SmartsToMol("Cl"))); + // [98] abraham_BSEL_frag_38 + queries.push_back(std::shared_ptr(SmartsToMol("Br"))); + // [99] abraham_BSEL_frag_39 + queries.push_back(std::shared_ptr(SmartsToMol("I"))); + // [100] abraham_BSEL_frag_4 + queries.push_back(std::shared_ptr(SmartsToMol("*=[CX3H2]"))); + // [101] abraham_BSEL_frag_40 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3;!R](=[OX1])[OX2H0]"))); + // [102] abraham_BSEL_frag_41 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3;R](=[OX1])[OX2H0;R]"))); + // [103] abraham_BSEL_frag_42 + queries.push_back(std::shared_ptr(SmartsToMol("P(=[OX1])(O)(O)O"))); + // [104] abraham_BSEL_frag_43 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=[OX1])([OX2H0])[OX2H0]"))); + // [105] abraham_BSEL_frag_44 + queries.push_back(std::shared_ptr(SmartsToMol("nC=[OX1]"))); + // [106] abraham_BSEL_frag_45 + queries.push_back(std::shared_ptr(SmartsToMol("[N;!R]C=[OX1]"))); + // [107] abraham_BSEL_frag_46 + queries.push_back(std::shared_ptr(SmartsToMol("[N;R][C;R]=[OX1]"))); + // [108] abraham_BSEL_frag_47 + queries.push_back(std::shared_ptr(SmartsToMol("[$([SX4](=[OX1])(=[OX1])([!O])[NX3]),$([SX4+2]([OX1-])([OX1-])([!O])[NX3])]"))); + // [109] abraham_BSEL_frag_48 + queries.push_back(std::shared_ptr(SmartsToMol("NC(=[OX1])N"))); + // [110] abraham_BSEL_frag_49 + queries.push_back(std::shared_ptr(SmartsToMol("[NX3,NX4+][CX3](=[OX1])[OX2,OX1-]"))); + // [111] abraham_BSEL_frag_5 + queries.push_back(std::shared_ptr(SmartsToMol("[$(*=[CX3H1]),$([cX3H1](a)a)]"))); + // [112] abraham_BSEL_frag_50 + 
queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=[OX1])[NX3][CX3](=[OX1])"))); + // [113] abraham_BSEL_frag_51 + queries.push_back(std::shared_ptr(SmartsToMol("C1(=[OX1])C=CC(=[OX1])C=C1"))); + // [114] abraham_BSEL_frag_52 + queries.push_back(std::shared_ptr(SmartsToMol("[$([CX4]([F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)])[F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)])]"))); + // [115] abraham_BSEL_frag_53 + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)])[CX4][F,Cl,Br,I,$([NX3](=O)=O),$([NX3+](=O)[O-]),$(C#N),$([CX4](F)(F)F)]"))); + // [116] abraham_BSEL_frag_54 + queries.push_back(std::shared_ptr(SmartsToMol("*1~*2~*(~*3~*(~*~*~*~*3)~*1)~*~*~*1~*2~*~*~*1"))); + // [117] abraham_BSEL_frag_55 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2H]CC[O,N]"))); + // [118] abraham_BSEL_frag_56 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2H]C[C,N]=[O,S]"))); + // [119] abraham_BSEL_frag_57 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2H]c1ccccc1[O,Nv3]"))); + // [120] abraham_BSEL_frag_58 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2H]c1ccccc1C=[O,S]"))); + // [121] abraham_BSEL_frag_59 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2H]c1ccccc1[$([NX3](=O)=O),$([NX3+](=O)[O-])]"))); + // [122] abraham_BSEL_frag_6 + queries.push_back(std::shared_ptr(SmartsToMol("[$(*=[CX3H0]),$([cX3H0](a)(a)A)]"))); + // [123] abraham_BSEL_frag_60 + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([OH])[CX4][OH]"))); + // [124] abraham_BSEL_frag_61 + queries.push_back(std::shared_ptr(SmartsToMol("n:n"))); + // [125] abraham_BSEL_frag_62 + queries.push_back(std::shared_ptr(SmartsToMol("o:n"))); + // [126] abraham_BSEL_frag_63 + queries.push_back(std::shared_ptr(SmartsToMol("n:c:n"))); + // [127] abraham_BSEL_frag_64 + queries.push_back(std::shared_ptr(SmartsToMol("o:c:n"))); + // [128] abraham_BSEL_frag_65 + 
queries.push_back(std::shared_ptr(SmartsToMol("n:c:c:n"))); + // [129] abraham_BSEL_frag_66 + queries.push_back(std::shared_ptr(SmartsToMol("[F,Cl,Br,I,N,O,S]-c:c-[F,Cl,Br,I,N,O,S]"))); + // [130] abraham_BSEL_frag_67 + queries.push_back(std::shared_ptr(SmartsToMol("[F,Cl,Br,I,N,O,S]-c:c:c-[F,Cl,Br,I,N,O,S]"))); + // [131] abraham_BSEL_frag_68 + queries.push_back(std::shared_ptr(SmartsToMol("[F,Cl,Br,I,N,O,S]-c:c:c:c-[F,Cl,Br,I,N,O,S]"))); + // [132] abraham_BSEL_frag_69 + queries.push_back(std::shared_ptr(SmartsToMol("P(=[OX1])N"))); + // [133] abraham_BSEL_frag_7 + queries.push_back(std::shared_ptr(SmartsToMol("c(a)(a)a"))); + // [134] abraham_BSEL_frag_70 + queries.push_back(std::shared_ptr(SmartsToMol("Nc:n"))); + // [135] abraham_BSEL_frag_71 + queries.push_back(std::shared_ptr(SmartsToMol("[$(cC[OH]);!$(c[CX3](=O)[OX1H0-,OX2H1])]"))); + // [136] abraham_BSEL_frag_72 + queries.push_back(std::shared_ptr(SmartsToMol("[$([#7+][OX1-]),$([#7v5]=[OX1]);!$([#7](~[O])~[O]);!$([#7]=[#7])]"))); + // [137] abraham_BSEL_frag_73 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2]-c:c-[OX2]"))); + // [138] abraham_BSEL_frag_8 + queries.push_back(std::shared_ptr(SmartsToMol("*#C"))); + // [139] abraham_BSEL_frag_9 + queries.push_back(std::shared_ptr(SmartsToMol("[c][NX3;H2]"))); + // [140] abraham_S_additional_0 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[CX4]"))); + // [141] abraham_S_additional_1 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)c1ccccc1"))); + // [142] abraham_S_additional_10 + queries.push_back(std::shared_ptr(SmartsToMol("C#Nc1ccccc1"))); + // [143] abraham_S_additional_11 + queries.push_back(std::shared_ptr(SmartsToMol("[NX3](=O)=Oc1ccccc1"))); + // [144] abraham_S_additional_12 + queries.push_back(std::shared_ptr(SmartsToMol("[SX4](=O)"))); + // [145] abraham_S_additional_13 + queries.push_back(std::shared_ptr(SmartsToMol("[SX4](=O)(=O)"))); + // [146] abraham_S_additional_14 + 
queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[F,Cl,Br,I]"))); + // [147] abraham_S_additional_2 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[CX4][CX4]"))); + // [148] abraham_S_additional_3 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[OX2][CX4]"))); + // [149] abraham_S_additional_4 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[OX2]c1ccccc1"))); + // [150] abraham_S_additional_5 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[NX3]"))); + // [151] abraham_S_additional_6 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[NX3]c1ccccc1"))); + // [152] abraham_S_additional_7 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[CX4][CX3](=O)"))); + // [153] abraham_S_additional_8 + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[CX3](=O)"))); + // [154] abraham_S_additional_9 + queries.push_back(std::shared_ptr(SmartsToMol("c1ccccc1[CX3](=O)c1ccccc1"))); + // [155] abraham_new_[#6](:[#6]:[#6]:[#6]-[#6]):[#6]:[#6] + queries.push_back(std::shared_ptr(SmartsToMol("[#6](:[#6]:[#6]:[#6]-[#6]):[#6]:[#6]"))); + // [156] abraham_new_[#6]:[#6]:[#6]:[#6]:[#6]:[#6]-[#6] + queries.push_back(std::shared_ptr(SmartsToMol("[#6]:[#6]:[#6]:[#6]:[#6]:[#6]-[#6]"))); + // [157] abraham_new_[#6]:[#6]:[#6]:[#6]:[#6]:[#6]-[#8] + queries.push_back(std::shared_ptr(SmartsToMol("[#6]:[#6]:[#6]:[#6]:[#6]:[#6]-[#8]"))); + // [158] abraham_new_[#6]:[#6]:[#6]:[#6]:[#6]:[#6].[#6]-[#6] + queries.push_back(std::shared_ptr(SmartsToMol("[#6]:[#6]:[#6]:[#6]:[#6]:[#6].[#6]-[#6]"))); + // [159] abraham_new_[#6]:[#6]:[#6]:[#6]:[#6]:[#6].[#6]:[#6] + queries.push_back(std::shared_ptr(SmartsToMol("[#6]:[#6]:[#6]:[#6]:[#6]:[#6].[#6]:[#6]"))); + // [160] abraham_new_[#6]:[#6]:[#6]:[#6]:[#6]:[#6]:[#6] + queries.push_back(std::shared_ptr(SmartsToMol("[#6]:[#6]:[#6]:[#6]:[#6]:[#6]:[#6]"))); + // [161] abraham_new_[CX2H0]#[CX2H0] + queries.push_back(std::shared_ptr(SmartsToMol("[CX2H0]#[CX2H0]"))); + // [162] abraham_new_[CX2H0]#[CX2H] 
+ queries.push_back(std::shared_ptr(SmartsToMol("[CX2H0]#[CX2H]"))); + // [163] abraham_new_[CX2H0]([CX4H3])#[CX2H] + queries.push_back(std::shared_ptr(SmartsToMol("[CX2H0]([CX4H3])#[CX2H]"))); + // [164] abraham_new_[CX2H0]([CX4H3])([CX4H3])#[CX2H] + queries.push_back(std::shared_ptr(SmartsToMol("[CX2H0]([CX4H3])([CX4H3])#[CX2H]"))); + // [165] abraham_new_[CX2H]#[CX2H] + queries.push_back(std::shared_ptr(SmartsToMol("[CX2H]#[CX2H]"))); + // [166] abraham_new_[CX2H]([CX4H3])#[CX2H] + queries.push_back(std::shared_ptr(SmartsToMol("[CX2H]([CX4H3])#[CX2H]"))); + // [167] abraham_new_[CX3H0]([CX4H3])([CX4H3])=[CX3H1] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3H0]([CX4H3])([CX4H3])=[CX3H1]"))); + // [168] abraham_new_[CX3H0]([CX4H3])([CX4H3])=[CX3H2] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3H0]([CX4H3])([CX4H3])=[CX3H2]"))); + // [169] abraham_new_[CX3H0]([CX4H3])=[CX3H1] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3H0]([CX4H3])=[CX3H1]"))); + // [170] abraham_new_[CX3H0]([CX4H3])=[CX3H2] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3H0]([CX4H3])=[CX3H2]"))); + // [171] abraham_new_[CX3H0]=[CX3H0] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3H0]=[CX3H0]"))); + // [172] abraham_new_[CX3H0]=[CX3H1] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3H0]=[CX3H1]"))); + // [173] abraham_new_[CX3H0]=[CX3H2] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3H0]=[CX3H2]"))); + // [174] abraham_new_[CX3H1]([CX4H3])=[CX3H1] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3H1]([CX4H3])=[CX3H1]"))); + // [175] abraham_new_[CX3H1]([CX4H3])=[CX3H2] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3H1]([CX4H3])=[CX3H2]"))); + // [176] abraham_new_[CX3H1]=[CX3H1] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3H1]=[CX3H1]"))); + // [177] abraham_new_[CX3H1]=[CX3H2] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3H1]=[CX3H2]"))); + // [178] abraham_new_[CX3H2]=[CX3H2] + 
queries.push_back(std::shared_ptr(SmartsToMol("[CX3H2]=[CX3H2]"))); + // [179] abraham_new_[CX3](=O) + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)"))); + // [180] abraham_new_[CX3](=O)[NX3][CX4] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[NX3][CX4]"))); + // [181] abraham_new_[CX3](=O)[OX2] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3](=O)[OX2]"))); + // [182] abraham_new_[CX3]=[CX3] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3]=[CX3]"))); + // [183] abraham_new_[CX3]=[CX3][CX4]([CX4]) + queries.push_back(std::shared_ptr(SmartsToMol("[CX3]=[CX3][CX4]([CX4])"))); + // [184] abraham_new_[CX3]=[CX3][CX4]([CX4])([CX4]) + queries.push_back(std::shared_ptr(SmartsToMol("[CX3]=[CX3][CX4]([CX4])([CX4])"))); + // [185] abraham_new_[CX3]=[CX3][CX4]([CX4])[CX4] + queries.push_back(std::shared_ptr(SmartsToMol("[CX3]=[CX3][CX4]([CX4])[CX4]"))); + // [186] abraham_new_[CX4H0]([CX4H2])([CX4H2])[CX4H2][CX4H3] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H0]([CX4H2])([CX4H2])[CX4H2][CX4H3]"))); + // [187] abraham_new_[CX4H0]([CX4H3])([CX4H2])[CX4H2][CX4H3] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H0]([CX4H3])([CX4H2])[CX4H2][CX4H3]"))); + // [188] abraham_new_[CX4H0]([CX4H3])([CX4H3])([CX4H3])([CX4H3]) + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H0]([CX4H3])([CX4H3])([CX4H3])([CX4H3])"))); + // [189] abraham_new_[CX4H0]([CX4H3])([CX4H3])([CX4H3])[CX4H3] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H0]([CX4H3])([CX4H3])([CX4H3])[CX4H3]"))); + // [190] abraham_new_[CX4H0]([CX4H3])([CX4H3])[CX4H2][CX4H3] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H0]([CX4H3])([CX4H3])[CX4H2][CX4H3]"))); + // [191] abraham_new_[CX4H0]([CX4H3])[CX2H]#[CX2H] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H0]([CX4H3])[CX2H]#[CX2H]"))); + // [192] abraham_new_[CX4H0]([CX4H3])[CX3H1]=[CX3H2] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H0]([CX4H3])[CX3H1]=[CX3H2]"))); + // [193] 
abraham_new_[CX4H1]([CX4H3])([CX4H2])[CX4H3] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H1]([CX4H3])([CX4H2])[CX4H3]"))); + // [194] abraham_new_[CX4H1]([CX4H3])([CX4H3])([CX4H3]) + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H1]([CX4H3])([CX4H3])([CX4H3])"))); + // [195] abraham_new_[CX4H1]([CX4H3])([CX4H3])[CX4H3] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H1]([CX4H3])([CX4H3])[CX4H3]"))); + // [196] abraham_new_[CX4H1]([CX4H3])[CX2H]#[CX2H] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H1]([CX4H3])[CX2H]#[CX2H]"))); + // [197] abraham_new_[CX4H1]([CX4H3])[CX3H1]=[CX3H2] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H1]([CX4H3])[CX3H1]=[CX3H2]"))); + // [198] abraham_new_[CX4H1]([CX4H3])[CX4H2][CX4H2][CX4H3] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H1]([CX4H3])[CX4H2][CX4H2][CX4H3]"))); + // [199] abraham_new_[CX4H1]([CX4H3])[CX4H2][CX4H3] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H1]([CX4H3])[CX4H2][CX4H3]"))); + // [200] abraham_new_[CX4H2]([CX4H3])([CX4H3]) + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H2]([CX4H3])([CX4H3])"))); + // [201] abraham_new_[CX4H2]([CX4H3])[CX2H]#[CX2H] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H2]([CX4H3])[CX2H]#[CX2H]"))); + // [202] abraham_new_[CX4H2]([CX4H3])[CX3H1]=[CX3H2] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H2]([CX4H3])[CX3H1]=[CX3H2]"))); + // [203] abraham_new_[CX4H2]([CX4H3])[CX4H2][CX4H2][CX4H3] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H2]([CX4H3])[CX4H2][CX4H2][CX4H3]"))); + // [204] abraham_new_[CX4H2]([CX4H3])[CX4H2][CX4H3] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H2]([CX4H3])[CX4H2][CX4H3]"))); + // [205] abraham_new_[CX4H2]([CX4H3])[CX4H3] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H2]([CX4H3])[CX4H3]"))); + // [206] abraham_new_[CX4H3][CX2H]#[CX2H] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H3][CX2H]#[CX2H]"))); + // [207] abraham_new_[CX4H3][CX3H0]=[CX3H2] + 
queries.push_back(std::shared_ptr(SmartsToMol("[CX4H3][CX3H0]=[CX3H2]"))); + // [208] abraham_new_[CX4H3][CX3H1]=[CX3H2] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H3][CX3H1]=[CX3H2]"))); + // [209] abraham_new_[CX4H3][CX4H2][CX4H3] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H3][CX4H2][CX4H3]"))); + // [210] abraham_new_[CX4H][CX4]([CX4])[CX4] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4H][CX4]([CX4])[CX4]"))); + // [211] abraham_new_[CX4]([CX4])([CX4])([CX4]) + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([CX4])([CX4])([CX4])"))); + // [212] abraham_new_[CX4]([CX4])([CX4])[CX4] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([CX4])([CX4])[CX4]"))); + // [213] abraham_new_[CX4]([CX4])[CX3]=[CX3] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([CX4])[CX3]=[CX3]"))); + // [214] abraham_new_[CX4]([CX4])[CX3]=[CX3][CX4] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([CX4])[CX3]=[CX3][CX4]"))); + // [215] abraham_new_[CX4]([CX4])[CX4] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([CX4])[CX4]"))); + // [216] abraham_new_[CX4]([CX4])[CX4]([CX4])[CX4] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([CX4])[CX4]([CX4])[CX4]"))); + // [217] abraham_new_[CX4]([CX4])[CX4][CX3](=O) + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([CX4])[CX4][CX3](=O)"))); + // [218] abraham_new_[CX4]([CX4])[CX4][CX4] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([CX4])[CX4][CX4]"))); + // [219] abraham_new_[CX4]([CX4])[CX4][OH] + queries.push_back(std::shared_ptr(SmartsToMol("[CX4]([CX4])[CX4][OH]"))); + // [220] abraham_new_[F,Cl,Br,I] + queries.push_back(std::shared_ptr(SmartsToMol("[F,Cl,Br,I]"))); + // [221] abraham_new_[F,Cl,Br,I][CX4] + queries.push_back(std::shared_ptr(SmartsToMol("[F,Cl,Br,I][CX4]"))); + // [222] abraham_new_[F,Cl,Br,I]c1ccccc1 + queries.push_back(std::shared_ptr(SmartsToMol("[F,Cl,Br,I]c1ccccc1"))); + // [223] abraham_new_[NX3;H0] + 
queries.push_back(std::shared_ptr(SmartsToMol("[NX3;H0]"))); + // [224] abraham_new_[NX3;H1] + queries.push_back(std::shared_ptr(SmartsToMol("[NX3;H1]"))); + // [225] abraham_new_[NX3;H2] + queries.push_back(std::shared_ptr(SmartsToMol("[NX3;H2]"))); + // [226] abraham_new_[NX3][CX4] + queries.push_back(std::shared_ptr(SmartsToMol("[NX3][CX4]"))); + // [227] abraham_new_[NX3]c1ccccc1 + queries.push_back(std::shared_ptr(SmartsToMol("[NX3]c1ccccc1"))); + // [228] abraham_new_[OH][CX3](=O) + queries.push_back(std::shared_ptr(SmartsToMol("[OH][CX3](=O)"))); + // [229] abraham_new_[OH][CX4] + queries.push_back(std::shared_ptr(SmartsToMol("[OH][CX4]"))); + // [230] abraham_new_[OH][CX4][CX4] + queries.push_back(std::shared_ptr(SmartsToMol("[OH][CX4][CX4]"))); + // [231] abraham_new_[OH]c1ccccc1 + queries.push_back(std::shared_ptr(SmartsToMol("[OH]c1ccccc1"))); + // [232] abraham_new_[OX2][CX4] + queries.push_back(std::shared_ptr(SmartsToMol("[OX2][CX4]"))); + // [233] abraham_new_[OX2][CX4][CX4] + queries.push_back(std::shared_ptr(SmartsToMol("[OX2][CX4][CX4]"))); + // [234] abraham_new_[OX2]c1ccccc1 + queries.push_back(std::shared_ptr(SmartsToMol("[OX2]c1ccccc1"))); + // [235] abraham_new_c1ccccc1 + queries.push_back(std::shared_ptr(SmartsToMol("c1ccccc1"))); + // [236] abraham_new_c1ccccc1[CX3](=O) + queries.push_back(std::shared_ptr(SmartsToMol("c1ccccc1[CX3](=O)"))); + // [237] abraham_new_c1ccccc1[F,Cl,Br,I] + queries.push_back(std::shared_ptr(SmartsToMol("c1ccccc1[F,Cl,Br,I]"))); + // [238] abraham_new_c1ccccc1[NX3] + queries.push_back(std::shared_ptr(SmartsToMol("c1ccccc1[NX3]"))); + // [239] abraham_new_c1ccccc1[OH] + queries.push_back(std::shared_ptr(SmartsToMol("c1ccccc1[OH]"))); + // [240] abraham_new_c1ccccc1[OX2] + queries.push_back(std::shared_ptr(SmartsToMol("c1ccccc1[OX2]"))); + }); + + return queries; +} + +} // namespace Osmordred +} // namespace Descriptors +} // namespace RDKit diff --git a/Code/GraphMol/Descriptors/smarts291/abraham_queries.h 
b/Code/GraphMol/Descriptors/smarts291/abraham_queries.h new file mode 100644 index 00000000000..c555ac36003 --- /dev/null +++ b/Code/GraphMol/Descriptors/smarts291/abraham_queries.h @@ -0,0 +1,21 @@ +// Auto-generated header for Abraham query functions + +#ifndef ABRAHAM_QUERIES_H +#define ABRAHAM_QUERIES_H + +#include +#include +#include + +namespace RDKit { +namespace Descriptors { +namespace Osmordred { + +// Get 241 base feature queries in EXACT model order (alphabetically sorted by feature name) +const std::vector>& GetQueriesAbrahamBaseFeatures(); + +} // namespace Osmordred +} // namespace Descriptors +} // namespace RDKit + +#endif // ABRAHAM_QUERIES_H diff --git a/Code/GraphMol/Descriptors/smarts291/test_smarts291.cpp b/Code/GraphMol/Descriptors/smarts291/test_smarts291.cpp new file mode 100644 index 00000000000..8c5e02a87b5 --- /dev/null +++ b/Code/GraphMol/Descriptors/smarts291/test_smarts291.cpp @@ -0,0 +1,250 @@ +// Copyright (c) 2025, Guillaume Godin Osmo Labs, PBC's and others +// All rights reserved. 
+// +// SMARTS291 - Abraham SMARTS-based Features Unit Tests +// Tests C++ implementation against Python CalcAbrahamsFeatures golden reference + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace RDKit; + +// Test tolerance - allow 1e-5 relative or absolute difference +const double TOLERANCE = 1e-5; + +// Check if two values match within tolerance +bool valuesMatch(double computed, double golden) { + // Both NaN = match + if (std::isnan(computed) && std::isnan(golden)) return true; + // One NaN = no match + if (std::isnan(computed) || std::isnan(golden)) return false; + // Both Inf with same sign = match + if (std::isinf(computed) && std::isinf(golden)) { + return (computed > 0) == (golden > 0); + } + // One Inf = no match + if (std::isinf(computed) || std::isinf(golden)) return false; + + // Absolute difference check + double absDiff = std::abs(computed - golden); + if (absDiff < TOLERANCE) return true; + + // Relative difference check + double maxVal = std::max(std::abs(computed), std::abs(golden)); + if (maxVal > 0 && absDiff / maxVal < TOLERANCE) return true; + + return false; +} + +TEST_CASE("SMARTS291 Basic Functionality", "[smarts291][basic]") { + SECTION("Ethanol features") { + ROMol* mol = SmilesToMol("CCO"); + REQUIRE(mol != nullptr); + + std::vector features = Descriptors::Osmordred::calcAbrahamsFeatures(*mol); + + // Should return 291 features (241 base + 50 golden) + CHECK(features.size() == 291); + + // Features should not be all zeros for a real molecule + double sum = 0; + for (double f : features) { + if (!std::isnan(f)) sum += std::abs(f); + } + CHECK(sum > 0); + + delete mol; + } + + SECTION("Base features extraction") { + ROMol* mol = SmilesToMol("c1ccccc1"); // Benzene + REQUIRE(mol != nullptr); + + std::vector baseFeatures = Descriptors::Osmordred::extractAbrahamBaseFeatures(*mol); + + // Should return exactly 241 base features + 
CHECK(baseFeatures.size() == 241); + + delete mol; + } +} + +TEST_CASE("SMARTS291 Batch Processing", "[smarts291][batch]") { + std::vector smiles = {"CCO", "CC(=O)C", "c1ccccc1", "CCN"}; + + for (const auto& smi : smiles) { + ROMol* mol = SmilesToMol(smi); + REQUIRE(mol != nullptr); + + std::vector features = Descriptors::Osmordred::calcAbrahamsFeatures(*mol); + CHECK(features.size() == 291); + + delete mol; + } +} + +TEST_CASE("SMARTS291 NCI Golden Reference Test", "[smarts291][golden]") { + // Locate the golden reference file + std::string rdbase = std::getenv("RDBASE") ? std::getenv("RDBASE") : ""; + std::string golden_path; + + if (!rdbase.empty()) { + golden_path = rdbase + "/Code/GraphMol/Descriptors/test_data/nci_100_smarts291_golden.csv"; + } else { + // Try relative path + golden_path = "Code/GraphMol/Descriptors/test_data/nci_100_smarts291_golden.csv"; + } + + std::ifstream golden_file(golden_path); + if (!golden_file.is_open()) { + // Try current directory + golden_path = "test_data/nci_100_smarts291_golden.csv"; + golden_file.open(golden_path); + } + + // If golden file not found, skip this test + if (!golden_file.is_open()) { + WARN("Golden file not found at " << golden_path << " - skipping golden reference test"); + return; + } + + // Parse header to get number of descriptors + std::string header; + std::getline(golden_file, header); + + // Count columns (smiles + 291 features) + size_t n_descriptors = 0; + { + std::stringstream ss(header); + std::string col; + while (std::getline(ss, col, ',')) { + n_descriptors++; + } + n_descriptors--; // Subtract SMILES column + } + + INFO("Golden file has " << n_descriptors << " descriptors"); + REQUIRE(n_descriptors == 291); + + int validated = 0; + int mismatches = 0; + std::string line; + + while (std::getline(golden_file, line) && validated < 100) { + std::stringstream ss(line); + std::string smiles; + std::getline(ss, smiles, ','); + + // Parse golden values + std::vector golden_values; + std::string value; 
+ while (std::getline(ss, value, ',')) { + try { + if (value.empty() || value == "nan" || value == "NaN") { + golden_values.push_back(std::nan("")); + } else if (value == "inf") { + golden_values.push_back(std::numeric_limits::infinity()); + } else if (value == "-inf") { + golden_values.push_back(-std::numeric_limits::infinity()); + } else { + golden_values.push_back(std::stod(value)); + } + } catch (...) { + golden_values.push_back(std::nan("")); + } + } + + if (golden_values.size() != 291) { + WARN("Skipping line with " << golden_values.size() << " values (expected 291)"); + continue; + } + + // Compute SMARTS291 features + ROMol* mol = SmilesToMol(smiles); + if (!mol) { + WARN("Could not parse SMILES: " << smiles); + continue; + } + + std::vector computed = Descriptors::Osmordred::calcAbrahamsFeatures(*mol); + delete mol; + + if (computed.size() != 291) { + WARN("Computed features have wrong size: " << computed.size()); + continue; + } + + // Compare each feature + for (size_t i = 0; i < 291; i++) { + if (!valuesMatch(computed[i], golden_values[i])) { + mismatches++; + if (mismatches <= 10) { // Only log first 10 mismatches + WARN("Mismatch at molecule " << validated << " (" << smiles << ") feature " << i + << ": computed=" << std::setprecision(10) << computed[i] + << " golden=" << golden_values[i]); + } + } + } + + validated++; + } + + golden_file.close(); + + INFO("Validated " << validated << " molecules"); + INFO("Total mismatches: " << mismatches << " out of " << (validated * 291) << " values"); + + // Require at least 90 molecules validated + REQUIRE(validated >= 90); + + // Allow up to 5% mismatch (some SMARTS patterns may have minor differences) + double mismatch_rate = static_cast(mismatches) / (validated * 291); + INFO("Mismatch rate: " << (mismatch_rate * 100) << "%"); + REQUIRE(mismatch_rate < 0.05); +} + +TEST_CASE("SMARTS291 Edge Cases", "[smarts291][edge]") { + SECTION("Single atom molecule") { + ROMol* mol = SmilesToMol("C"); // Methane + 
REQUIRE(mol != nullptr); + + std::vector features = Descriptors::Osmordred::calcAbrahamsFeatures(*mol); + CHECK(features.size() == 291); + + delete mol; + } + + SECTION("Large molecule") { + // Cholesterol-like structure + ROMol* mol = SmilesToMol("CC(C)CCCC(C)C1CCC2C1(CCC3C2CC=C4C3(CCC(C4)O)C)C"); + REQUIRE(mol != nullptr); + + std::vector features = Descriptors::Osmordred::calcAbrahamsFeatures(*mol); + CHECK(features.size() == 291); + + delete mol; + } + + SECTION("Molecule with heteroatoms") { + ROMol* mol = SmilesToMol("c1ccc(N)c(O)c1S"); // Amino-thiophenol + REQUIRE(mol != nullptr); + + std::vector features = Descriptors::Osmordred::calcAbrahamsFeatures(*mol); + CHECK(features.size() == 291); + + delete mol; + } +}