diff --git a/src/libraries/UTILITIES/AmplitudeParser.cc b/src/libraries/UTILITIES/AmplitudeParser.cc new file mode 100644 index 000000000..6b999eda6 --- /dev/null +++ b/src/libraries/UTILITIES/AmplitudeParser.cc @@ -0,0 +1,259 @@ +/** + * @file AmplitudeParser.cc + * @author Kevin Scheuer + * @brief Implementation of the amplitude naming parser and sum grouping logic. + * @date 2026-05-08 + */ + +#include +#include +#include + +#include "AmplitudeParser.h" +#include "IUAmpTools/report.h" + +const char *AmplitudeParser::kModule = "AmplitudeParser"; + +AmplitudeParser::AmplitudeParser(const std::string &naming_scheme) : m_requested_scheme(namingSchemeFromString(naming_scheme)) {} + +AmplitudeParser::NamingScheme +AmplitudeParser::namingSchemeFromString(const std::string &scheme_name) +{ + if (scheme_name == "JLme") + return NamingScheme::JLme; + if (scheme_name == "eJPmL") + return NamingScheme::eJPmL; + if (scheme_name == "Lme") + return NamingScheme::Lme; + return NamingScheme::Auto; +} + +AmplitudeParser::NamingScheme +AmplitudeParser::inferNamingScheme(const std::string &_name) +{ + if (amp_name.empty()) + { + report(ERROR, kModule) << "Amplitude name is empty, cannot infer naming scheme.\n"; + assert(false); + } + + if (std::isdigit(static_cast(amp_name.front()))) + return NamingScheme::JLme; + + if (std::isupper(static_cast(amp_name.front())) && + (amp_name.back() == '+' || amp_name.back() == '-')) + return NamingScheme::Lme; + + if ((amp_name.front() == 'p' || amp_name.front() == 'm') && + std::isupper(static_cast(amp_name.back()))) + return NamingScheme::eJPmL; + + return NamingScheme::Auto; // default if we can't infer +} + +std::vector +AmplitudeParser::filterAmplitudesByScheme( + const std::vector &full_amplitudes, + NamingScheme scheme) const +{ + std::vector filtered; + for (const auto &full_amp : full_amplitudes) + { + const std::string amp_name = ampName(full_amp); + if (inferNamingScheme(amp_name) == scheme) + filtered.push_back(full_amp); + else + { + report(WARNING, kModule) << "Amplitude '" + << full_amp + << "' does not match the inferred naming scheme and will be excluded from sum groups.\n"; + } + } + return filtered; +} + +std::string AmplitudeParser::ampName(const std::string &full_name) +{ + const size_t pos = full_name.rfind("::"); + return (pos == std::string::npos) ? full_name : full_name.substr(pos + 2); +} + +AmplitudeParser::ParsedAmplitude +AmplitudeParser::parseAmplitudeName(const std::string &_name, NamingScheme scheme) const +{ + ParsedAmplitude parsed; + parsed.amp_name = amp_name; + + // Parse quantum numbers according to the structure of the naming scheme + switch (scheme) + { + case NamingScheme::JLme: + parsed.J = amp_name.substr(0, 1); + parsed.L = amp_name.substr(1, 1); + parsed.m = amp_name.substr(2, amp_name.size() - 3); + parsed.e = amp_name.substr(amp_name.size() - 1, 1); + break; + case NamingScheme::Lme: + parsed.L = amp_name.substr(0, 1); + parsed.m = amp_name.substr(1, amp_name.size() - 2); + parsed.e = amp_name.substr(amp_name.size() - 1, 1); + break; + case NamingScheme::eJPmL: + parsed.e = amp_name.substr(0, 1); + parsed.J = amp_name.substr(1, 1); + parsed.P = amp_name.substr(2, 1); + parsed.m = amp_name.substr(3, amp_name.size() - 4); + parsed.L = amp_name.substr(amp_name.size() - 1, 1); + break; + default: + report(WARNING, kModule) << "Unknown naming scheme, cannot parse amplitude name '" << amp_name << "'.\n"; + break; + } + + return parsed; +} + +std::vector +AmplitudeParser::sumsForScheme(NamingScheme scheme) const +{ + if (scheme == NamingScheme::Lme) + { + return { + {{'e'}}, + {{'L'}}, + {{'e', 'L'}}, + {{'L', 'm'}}, + }; + } + + if (scheme == NamingScheme::JLme) + { + return { + {{'e'}}, + {{'J'}}, + {{'J', 'e'}}, + {{'J', 'L'}}, + {{'J', 'L', 'e'}}, + {{'J', 'L', 'm'}}, + }; + } + if (scheme == NamingScheme::eJPmL) + { + return { + {{'e'}}, + {{'J', 'P'}}, + {{'e', 'J', 'P'}}, + {{'J', 'P', 'L'}}, + {{'e', 'J', 'P', 'L'}}, + {{'J', 'P', 'm', 'L'}}, + }; + } + else + { + report(WARNING, kModule) << "Unknown naming scheme, cannot determine sum groups.\n"; + return {}; + } +} + +std::string AmplitudeParser::buildSumLabel( + const SumGroupDef &sum_group, + const ParsedAmplitude &parsed) const +{ + std::string label; + for (const auto &qn : sum_group.quantum_numbers) + { + std::string value = ""; + switch (qn) + { + case 'e': + value = parsed.e; + break; + case 'J': + value = parsed.J; + break; + case 'P': + value = parsed.P; + break; + case 'm': + value = parsed.m; + break; + case 'L': + value = parsed.L; + break; + default: + report(WARNING, kModule) << "Unknown quantum number '" << qn << "' in sum group definition.\n"; + break; + } + label += value; + } + return label; +} + +std::map> +AmplitudeParser::buildSumGroups(const std::vector &full_amplitudes) const +{ + std::map> groups; + + NamingScheme effective_scheme = m_requested_scheme; + + if (full_amplitudes.empty()) + { + report(WARNING, kModule) << "No amplitudes found in fit results, so no sums will be built.\n"; + return groups; + } + + // "auto" means we need to infer the naming scheme from the amplitude labels. + // We'll use the first amplitude label as a sample to infer the scheme. + if (effective_scheme == NamingScheme::Auto) + effective_scheme = inferNamingScheme(ampName(full_amplitudes.front())); + + // If we still can't infer a scheme, warn the user and don't attempt to build sums + // NOTE: if you have gotten this message and would like to build your own scheme, + // see the header file for instructions to do so. + if (effective_scheme == NamingScheme::Auto) + { + report(WARNING, kModule) << "Amplitude naming scheme could not be inferred, and so no sums will be built.\n"; + report(WARNING, kModule) << "This likely means your amplitude names don't follow a recognized pattern. Supported patterns include:\n"; + report(WARNING, kModule) << "\t1) JLme: e.g. 1P-1+ \n"; + report(WARNING, kModule) << "\t2) Lme: e.g. P1+ \n"; + report(WARNING, kModule) << "\t3) eJPmL: e.g. p1ppS \n"; + report(WARNING, kModule) << "You can also specify the naming scheme pattern (from the choices above) directly in the FitConverter constructor to avoid inference.\n"; + return groups; + } + + // Ensure that all amplitudes we attempt to group follow the same naming scheme, + // otherwise we will get undefined behavior when parsing the amplitude names. If we + // find any amplitudes that don't match the effective scheme, we will skip them and + // warn the user. + std::vector amplitudes_in_scheme = filterAmplitudesByScheme(full_amplitudes, effective_scheme); + + // determine what sum groups will be made + const std::vector sum_groups = sumsForScheme(effective_scheme); + + // loop over all amplitudes across all reactions and sums, parse the amplitude name, + // and group according to the requested scheme + for (const auto &full_amp : full_amplitudes) + { + const ParsedAmplitude parsed = parseAmplitudeName(ampName(full_amp), effective_scheme); + + for (const auto &sum_group : sum_groups) + { + // create the label that will be used as the CSV header + const std::string label = buildSumLabel(sum_group, parsed); + if (label.empty()) + continue; + + // add the full amplitude name that belongs to this sum group in the map + groups[label].push_back(full_amp); + } + } + + // remove duplicates from the sum groups + for (auto &pair : groups) + { + std::sort(pair.second.begin(), pair.second.end()); + pair.second.erase(std::unique(pair.second.begin(), pair.second.end()), pair.second.end()); + } + + return groups; +} \ No newline at end of file diff --git a/src/libraries/UTILITIES/AmplitudeParser.h b/src/libraries/UTILITIES/AmplitudeParser.h new file mode 100644 index 000000000..9dcf83b06 --- /dev/null +++ b/src/libraries/UTILITIES/AmplitudeParser.h @@ -0,0 +1,220 @@ +/** + * @file AmplitudeParser.h + * @author Kevin Scheuer + * @brief Helpers for parsing amplitude labels and building sum groups. + * @date 2026-05-08 + * + * @page How to Build Your A Custom Naming Scheme + * The AmplitudeParser is designed to be flexible and support different amplitude naming + * schemes. If your amplitudes follow a different naming convention than the ones + * currently supported, you can easily extend the AmplitudeParser to accommodate your + * scheme. + * + * Add the label for your naming scheme in the NamingScheme enum, and then implement + * the parsing logic in the inferNamingScheme() and parseAmplitudeName() methods. Build + * your desired sum groups in sumsForScheme(), and make sure your label will be + * properly made in buildSumLabel(). The parser will then be able to extract the + * relevant quantum numbers from your amplitude names and group them into sums according + * to your definitions. + * + * It's very important that your scheme has unique identifiers for the sums you want to + * build. For example, if you want to build sums over all amplitudes with the same J, + * and another sum just over amplitudes with the same m, you need to ensure the J and + * m values can be uniquely extracted from the amplitude name. If both use digits, + * then the parser will not know which digits correspond to J and which correspond to m. + * In that case, you would need to modify the amplitude parser to look for specific + * delimiters or patterns in the amplitude name to extract the quantum numbers + * correctly. + * + */ + +#ifndef FIT_AMPLITUDE_PARSER_H +#define FIT_AMPLITUDE_PARSER_H + +#include +#include +#include + +/** + * @class AmplitudeParser + * @brief Parses amplitude names and builds coherent sums based on naming schemes + * + * This class is responsible for parsing amplitude names from AmpTools fit results and + * grouping them into coherent sums based on shared quantum numbers. It supports + * multiple naming schemes, which can be selected by the user or inferred from the + * amplitude labels. The parser identifies the relevant quantum numbers from the + * amplitude names and groups amplitudes that share those quantum numbers into sums, + * which can then be used to calculate intensities of sums e.g. total reflectivity + * contributions, or total spin contributions. + * + * @note All amplitudes that match the criteria for a sum group will be included, + * meaning amplitudes will be gathered across reactions and sums as long as they share + * the relevant quantum numbers based on the selected naming scheme. For example, if we + * have the following amplitudes in the "eJPmL" naming scheme: + * - "reaction1::sum1::p1ppS" + * - "reaction1::sum2::p1ppS" + * - "reaction2::sum1::p1ppS" + * Then all three of these amplitudes would be included in the same sum group "JPL" + * because they all share the same J, P, and L values. + * + */ +class AmplitudeParser +{ +public: + enum class NamingScheme + { + Auto, + JLme, + eJPmL, + Lme + }; + + /** + * @brief Construct a new Amplitude Parser object + * + * @param naming_scheme what amplitude naming scheme to use for parsing amplitude names and building sum groups. + * Supported schemes include: + * - "auto": the parser will attempt to infer the naming scheme from the amplitude labels + * - "JLme": amp names like "1P-1+" (the one recommended for most amplitude analyses) + * - "eJPmL": amp names like "p1ppS" (a common vector-pseudoscalar naming scheme) + * - "Lme": amp names like "P1+" (a common two-pseudoscalar naming scheme) + */ + explicit AmplitudeParser(const std::string &naming_scheme = "auto"); + + /** + * @brief + * + * @param full_amplitudes + * @return std::map> + */ + std::map> + buildSumGroups(const std::vector &full_amplitudes) const; + +private: + /** + * @brief struct to contain the parsed quantum numbers from an amplitude name + * + * For example: + * "reaction::sum::p1ppS" would be parsed to: + * amp_name = "p1ppS" + * e = "p" + * J = "1" + * P = "p" + * m = "p" + * L = "S" + * + * "reaction::sum::2P1+" would be parsed to: + * amp_name = "2P1+" + * e = "+" + * J = "2" + * L = "P" + * m = "1" + * P is not included in this naming scheme, so it would be empty. + */ + struct ParsedAmplitude + { + std::string amp_name; ///< The "ampName" in the full "reaction::sum::ampName" + std::string e; ///< Reflectivity quantum number + std::string J; ///< Total angular momentum + std::string P; ///< Parity (sometimes excluded as its unnecessary) + std::string m; ///< Spin projection + std::string L; ///< Orbital angular momentum + }; + + NamingScheme m_requested_scheme; ///< The naming scheme to use, or "Auto" to infer from amplitude labels + + /** + * @brief Set the naming scheme based on the input string from the user. + * + * If the input is "auto" or is an unrecognized string, we will attempt to infer the + * naming scheme from the amplitude labels later. + * + * @param scheme_name user string indicating the naming scheme + * @return NamingScheme objective naming scheme enum value + */ + static NamingScheme namingSchemeFromString(const std::string &scheme_name); + + /** + * @brief Infer the naming scheme from the amplitude name format if "auto" is selected. + * + * @param amp_name the part of the full amplitude string after the last "::" + * + * @throw ERROR if the name is empty and we cannot infer a scheme + * + * @return NamingScheme::JLme if the name starts with a digit + * NamingScheme::Lme if the name starts with an uppercase letter and ends with + or - + * NamingScheme::eJPmL if the name starts with "p" or "m" and ends with an uppercase letter + * NamingScheme::Auto if we cannot infer a scheme + */ + static NamingScheme inferNamingScheme(const std::string &_name); + + /** + * @brief Filter amplitudes to belong to the chosen naming scheme + * + * To avoid undefined behavior when parsing amplitude names, we need to ensure that + * all amplitudes we attempt to group follow the same naming scheme. Any amps not + * belonging to the scheme will be reported in a printed warning, and excluded from + * the returned list. + * + * @param full_amplitudes all amplitudes from a fit result, in their full "reaction::sum::ampName" form + * @param scheme naming scheme to filter by + * @return std::vector full amplitudes that match the naming scheme + */ + std::vector filterAmplitudesByScheme(const std::vector &full_amplitudes, NamingScheme scheme) const; + + /** + * @brief Extract the amplitude name from the full amplitude string. + * + * For example, for "reaction::sum::amp", this would return "amp". This is the part + * of the amplitude label that typically contains the quantum numbers and is used + * for grouping into sums. + * + * @param full_name "reaction::sum::ampName" style amplitude string + * @return std::string ampName, the part of the full amplitude string after the last "::" + */ + static std::string ampName(const std::string &full_name); + + /** + * @brief Extract quantum numbers from the amplitude name according to the scheme + * + * @param amp_name ampName in "reaction::sum::ampName" + * @param scheme naming scheme to use for parsing + * @return ParsedAmplitude object that contains the original amp_name and the parsed quantum numbers + */ + ParsedAmplitude parseAmplitudeName(const std::string &_name, NamingScheme scheme) const; + + /** + * @brief Define the grouping of quantum numbers that will become a sum label + */ + struct SumGroupDef + { + std::vector quantum_numbers; ///< Quantum number fields to concatenate for the group label + }; + + /** + * @brief Get the sum definitions for a given naming scheme + * + * A sum label is denoted by an absence of a quantum number in the amplitude + * naming scheme. For example, the JLme scheme will have a "JL" sum group that + * includes all amplitudes with the same J and L, but any m and e. Since the + * ordering of the quantum numbers is dependent on the scheme, the group labels are + * thus also scheme-dependent. + * + * @param scheme naming scheme to get group definitions for + * @return std::vector + */ + std::vector sumsForScheme(NamingScheme scheme) const; + + /** + * @brief concatenates the quantum numbers for a SumGroupDef into a string label + * + * @param sum_group a grouping of quantum numbers to concatenate into a sum label, e.g. J and L + * @param parsed a parsed amplitude name + * @return std::string the label to be used as the key in sum map e.g. "1P" for a JL group with J=1 and L=P + */ + std::string buildSumLabel(const SumGroupDef &sum_group, const ParsedAmplitude &parsed) const; + + static const char *kModule; +}; + +#endif // FIT_AMPLITUDE_PARSER_H \ No newline at end of file diff --git a/src/libraries/UTILITIES/FitConverter.cc b/src/libraries/UTILITIES/FitConverter.cc new file mode 100644 index 000000000..4f04872d6 --- /dev/null +++ b/src/libraries/UTILITIES/FitConverter.cc @@ -0,0 +1,599 @@ +/** + * @file FitConverter.cc + * @author Kevin Scheuer + * @brief Implementation of FitConverter class for converting AmpTools fit results to CSV + * @date 2026-05-08 + * + */ + +#include +#include +#include + +#include "FitConverter.h" +#include "IUAmpTools/ConfigurationInfo.h" +#include "IUAmpTools/FitResults.h" +#include "IUAmpTools/NormIntInterface.h" +#include "IUAmpTools/report.h" + +const char *FitConverter::kModule = "FitConverter"; + +FitConverter::FitConverter(const std::string &filename, const bool &acceptance_correct, bool mute_warning, const std::string &naming_scheme) : m_fit_file(filename), + m_fit_results(filename, mute_warning), + m_cfg_info(m_fit_results.configInfo()), + m_is_acceptance_corrected(acceptance_correct), + m_amplitude_parser(naming_scheme), + m_error_matrix(m_fit_results.errorMatrix()) +{ + if (!m_fit_results.valid()) + { + report(ERROR, kModule) << "FitConverter ERROR: Unable to read fit results from file " + m_fit_file << "\n"; + assert(false); + } + extract(); +} + +void FitConverter::extract() +{ + // Store the standard AmpTools fit outputs + m_standard_results["eMatrixStatus"] = m_fit_results.eMatrixStatus(); + m_standard_results["lastMinuitCommandStatus"] = m_fit_results.lastMinuitCommandStatus(); + m_standard_results["likelihood"] = m_fit_results.likelihood(); + m_standard_results["intensity"] = m_fit_results.intensity(false).first; + m_standard_results["intensity_err"] = m_fit_results.intensity(false).second; + m_standard_results["ac_intensity"] = m_fit_results.intensity(true).first; + m_standard_results["ac_intensity_err"] = m_fit_results.intensity(true).second; + m_standard_results["estDistToMinimum"] = m_fit_results.estDistToMinimum(); + for (const auto &pair : m_standard_results) + { + report(DEBUG, kModule) << "Standard result: " << pair.first << " = " << pair.second << "\n"; + } + + // Store parameters, but ignore production coefficients (typically have "::" in name) + for (const auto &par_name : m_fit_results.parNameList()) + { + if (par_name.find("::") == std::string::npos) + m_parameters[par_name] = {m_fit_results.parValue(par_name), m_fit_results.parError(par_name)}; + } + for (const auto &pair : m_parameters) + { + report(DEBUG, kModule) << "Parameter: " << pair.first << " = " << pair.second.first << " +/- " << pair.second.second << "\n"; + } + + // Build map of unique amplitude names and their constrained amplitudes + m_constrained_amps = findConstrainedAmps(); + for (const auto &pair : m_constrained_amps) + { + report(DEBUG, kModule) << "Unique amplitude: " + << pair.first + << " has constrained amplitudes:\n"; + for (const auto & : pair.second) + report(DEBUG, kModule) << "\t" << amp << "\n"; + } + + // Calculate unique amplitude intensities + for (const auto &pair : m_constrained_amps) + { + const std::string &unique_amp = pair.first; + const std::vector &full_amps = pair.second; + + std::pair intensity = m_fit_results.intensity(full_amps, m_is_acceptance_corrected); + m_unique_amp_intensities[unique_amp] = intensity; + + report(DEBUG, kModule) << "Unique amplitude intensity for " + << unique_amp + << " = " + << intensity.first + << " +/- " + << intensity.second + << "\n"; + } + + // Build coherent-sum groups based on the selected amplitude naming scheme. + const std::map> coherent_sum_groups = + m_amplitude_parser.buildSumGroups(m_fit_results.ampList()); + + for (const auto &pair : coherent_sum_groups) + { + m_coherent_sum_intensities[pair.first] = m_fit_results.intensity(pair.second, m_is_acceptance_corrected); + + report(DEBUG, kModule) << "Coherent sum group: " << pair.first << " includes amplitudes:\n"; + for (const auto & : pair.second) + report(DEBUG, kModule) << "\t" << amp << "\n"; + + report(DEBUG, kModule) << "Coherent sum intensity for " + << pair.first + << " = " + << m_coherent_sum_intensities[pair.first].first + << " +/- " + << m_coherent_sum_intensities[pair.first].second + << "\n"; + } + + // Extract production coefficients for each unique amplitude name + for (const auto &pair : m_constrained_amps) + { + // any full amplitude name will do, they are all constrained so their + // production coefficients are identical + const std::string &unique_amp = pair.first; + if (pair.second.empty()) + continue; + + const std::string &full_amp_name = pair.second[0]; + m_production_coefficients[unique_amp] = m_fit_results.scaledProductionParameter(full_amp_name); + } + + // Calculate phase differences between all pairs of unique amplitude names that + // share the same "reaction::sum" + for (const auto &pair : m_constrained_amps) + { + const std::string &1_name = pair.first; + const std::vector &1_full_names = pair.second; + + for (const auto &2_pair : m_constrained_amps) + { + const std::string &2_name = amp2_pair.first; + if (amp1_name >= amp2_name) + continue; // avoid self-comparison and symmetric duplicates + const std::vector &2_full_names = amp2_pair.second; + + // this will ensure we only calculate one phase difference per unique amplitude pair + bool found_phase_diff = false; + + // search for shared reaction::sum between amp1 and amp2 + for (const auto &full_amp1 : amp1_full_names) + { + for (const auto &full_amp2 : amp2_full_names) + { + // check if they share reaction::sum + size_t pos1 = full_amp1.find("::", 0); + size_t pos2 = full_amp1.find("::", pos1 + 2); + std::string reaction_sum1 = full_amp1.substr(0, pos2); + + pos1 = full_amp2.find("::", 0); + pos2 = full_amp2.find("::", pos1 + 2); + std::string reaction_sum2 = full_amp2.substr(0, pos2); + + if (reaction_sum1 == reaction_sum2) + { + // they share a coherent sum, so calculate phase difference + std::pair phase_diff = m_fit_results.phaseDiff(full_amp1, full_amp2); + m_phase_differences[{full_amp1, full_amp2}] = phase_diff; + + report(DEBUG, kModule) << "Phase difference between " + << full_amp1 + << " and " + << full_amp2 + << " = " + << phase_diff.first + << " +/- " + << phase_diff.second + << "\n"; + + found_phase_diff = true; + break; // break inner amp2 loop + } + } + if (found_phase_diff) + break; // break outer amp1 loop + } + } + } +} + +std::set FitConverter::uniqueAmpNames() const +{ + std::set unique_names; + for (const auto &litude : m_fit_results.ampList()) + { + size_t last_colon = amplitude.rfind("::"); + if (last_colon != std::string::npos) + { + std::string amp_name = amplitude.substr(last_colon + 2); + unique_names.insert(amp_name); + } + } + return unique_names; +} + +std::set FitConverter::uniqueSumAmpNames() const +{ + std::set unique_names; + for (const auto &litude : m_fit_results.ampList()) + { + size_t first_colon = amplitude.find("::"); + size_t last_colon = amplitude.rfind("::"); + if (first_colon != std::string::npos && last_colon != std::string::npos && first_colon != last_colon) + { + std::string sum_amp_name = amplitude.substr(first_colon + 2, last_colon - first_colon - 2); + unique_names.insert(sum_amp_name); + } + } + return unique_names; +} + +std::string FitConverter::getCSVHeader() const +{ + std::string header = "file,"; + // standard results + for (const auto &pair : m_standard_results) + { + header += pair.first + ","; + } + // parameters + for (const auto &pair : m_parameters) + { + header += pair.first + "," + pair.first + "_err,"; + } + // production coefficients + for (const auto &pair : m_production_coefficients) + { + header += pair.first + "_re," + pair.first + "_im,"; + } + // unique amplitude intensities + for (const auto &pair : m_unique_amp_intensities) + { + header += pair.first + "," + pair.first + "_err,"; + } + // coherent sum intensities + for (const auto &pair : m_coherent_sum_intensities) + { + header += pair.first + "," + pair.first + "_err,"; + } + // phase differences + for (const auto &pair : m_phase_differences) + { + std::string full_amp1 = pair.first.first; + std::string full_amp2 = pair.first.second; + + // extract unique amplitude names from full amplitude strings + size_t last_colon1 = full_amp1.rfind("::"); + size_t last_colon2 = full_amp2.rfind("::"); + std::string amp1 = (last_colon1 != std::string::npos) ? full_amp1.substr(last_colon1 + 2) : full_amp1; + std::string amp2 = (last_colon2 != std::string::npos) ? full_amp2.substr(last_colon2 + 2) : full_amp2; + + header += amp1 + "_" + amp2 + "," + amp1 + "_" + amp2 + "_err,"; + } + // remove trailing comma + if (!header.empty() && header.back() == ',') + { + header.pop_back(); + } + return header; +} + +std::string FitConverter::getCSVRow() const +{ + std::string row = m_fit_file + ","; + // standard results + for (const auto &pair : m_standard_results) + { + row += std::to_string(pair.second) + ","; + } + // parameters + for (const auto &pair : m_parameters) + { + row += std::to_string(pair.second.first) + "," + std::to_string(pair.second.second) + ","; + } + // production coefficients + for (const auto &pair : m_production_coefficients) + { + row += std::to_string(pair.second.real()) + "," + std::to_string(pair.second.imag()) + ","; + } + // unique amplitude intensities + for (const auto &pair : m_unique_amp_intensities) + { + row += std::to_string(pair.second.first) + "," + std::to_string(pair.second.second) + ","; + } + // coherent sum intensities + for (const auto &pair : m_coherent_sum_intensities) + { + row += std::to_string(pair.second.first) + "," + std::to_string(pair.second.second) + ","; + } + // phase differences + for (const auto &pair : m_phase_differences) + { + row += std::to_string(pair.second.first) + "," + std::to_string(pair.second.second) + ","; + } + // remove trailing comma + if (!row.empty() && row.back() == ',') + { + row.pop_back(); + } + return row; +} + +std::string FitConverter::getCSVCovarianceMatrixHeader() const +{ + std::string header = "file,parameter"; + for (const auto &par : m_fit_results.parNameList()) + { + header += "," + par; + } + return header; +} + +std::string FitConverter::getCSVCorrelationMatrixHeader() const +{ + return getCSVCovarianceMatrixHeader(); // same header format +} + +std::string FitConverter::getCSVCovarianceMatrix() const +{ + std::string csv; + for (long unsigned int row = 0; row < m_error_matrix.size(); row++) + { + const std::string row_par = m_fit_results.parNameList()[row]; + csv += m_fit_file + "," + row_par; + + for (long unsigned int col = 0; col < m_error_matrix[row].size(); col++) + { + const std::string col_par = m_fit_results.parNameList()[col]; + + // double check that our indexing is correct + if (m_error_matrix[row][col] != m_fit_results.covariance(row_par, col_par)) + { + report(ERROR, kModule) << "Mismatch in covariance values between parameters " + << row_par << " and " << col_par << ".\n"; + assert(false); + } + + csv += "," + std::to_string(m_error_matrix[row][col]); + } + + // don't add newline to last row, to avoid extra blank line at end of file + // and to match the style of the other csv functions + if (row != m_error_matrix.size() - 1) + csv += "\n"; + } + return csv; +} + +std::string FitConverter::getCSVCorrelationMatrix() const +{ + std::string csv; + for (long unsigned int row = 0; row < m_error_matrix.size(); row++) + { + const std::string row_par = m_fit_results.parNameList()[row]; + const double row_par_error = m_fit_results.parError(row_par); + csv += m_fit_file + "," + row_par; + + for (long unsigned int col = 0; col < m_error_matrix[row].size(); col++) + { + const std::string col_par = m_fit_results.parNameList()[col]; + const double col_par_error = m_fit_results.parError(col_par); + + // double check that our indexing is correct + if (m_error_matrix[row][col] != m_fit_results.covariance(row_par, col_par)) + { + report(ERROR, kModule) << "Mismatch in covariance values between parameters " + << row_par << " and " << col_par << ".\n"; + assert(false); + } + + // avoid division by zero + if (row_par_error == 0.0 || col_par_error == 0.0) + { + csv += ",0.0"; // set correlation to 0 if error is zero + continue; + } + + double correlation = m_error_matrix[row][col] / (row_par_error * col_par_error); + csv += "," + std::to_string(correlation); + } + + // don't add newline to last row, to avoid extra blank line at end of file + // and to match the style of the other csv functions + if (row != m_error_matrix.size() - 1) + csv += "\n"; + } + return csv; +} + +std::string FitConverter::getCSVNormIntMatrixHeader() const +{ + std::string header = "file,amplitude"; + for (const auto & : m_fit_results.ampList()) + { + header += "," + amp; + } + return header; +} + +std::string FitConverter::getCSVNormIntMatrix() const +{ + std::string csv; + + // build the norm_int_map of reactions -> normIntInterfaces + std::map norm_int_map; + for (const std::string &reaction : m_fit_results.reactionList()) + { + const NormIntInterface *normInt = m_fit_results.normInt(reaction); + if (normInt) + { + norm_int_map[reaction] = normInt; + } + else + { + report(ERROR, kModule) << "Could not retrieve normalization integral interface for reaction " << reaction << "\n"; + assert(false); + } + } + + std::vector amp_list = m_fit_results.ampList(); + for (size_t row_idx = 0; row_idx < m_fit_results.ampList().size(); row_idx++) + { + const std::string &row_amp = amp_list.at(row_idx); + std::string row_reaction = getReactionString(row_amp); + + csv += m_fit_file + "," + row_amp; // write file name and this amplitude name as first two columns of this row + + for (size_t col_idx = 0; col_idx < m_fit_results.ampList().size(); col_idx++) + { + const std::string &col_amp = amp_list.at(col_idx); + std::string col_reaction = getReactionString(col_amp); + + if (row_reaction != col_reaction) + { + csv += ",0+0j"; // normInt is zero for amplitudes from different reactions + continue; // skip to next column + } + + const NormIntInterface *normInt = norm_int_map[row_reaction]; + if (!normInt) + { + report(ERROR, kModule) << "Null normalization integral interface for reaction " << row_reaction << "\n"; + assert(false); + } + + std::complex value = normInt->normInt(row_amp, col_amp); + + // Format as pandas-readable complex number e.g. 1+2j or 1-2j + std::ostringstream oss; + oss << std::scientific << std::setprecision(std::numeric_limits::max_digits10 - 1); + oss << value.real(); + if (value.imag() >= 0) + { + oss << "+"; + } + oss << value.imag() << "j"; + csv += "," + oss.str(); + } // end column loop + + // don't add newline to last row, to avoid extra blank line at end of file + if (row_idx != m_fit_results.ampList().size() - 1) + csv += "\n"; + + } // end row loop + + return csv; +} + +std::map> FitConverter::findConstrainedAmps() const +{ + std::map> constrained_amp_map; + + if (ampNamesAreConstrained()) + { + for (const auto &unique_amp : uniqueAmpNames()) + { + // the bool assures us that all terms with this amp name are constrained to + // each other + std::vector shared_terms = m_cfg_info->termList("", "", unique_amp); + + // store the full names of the constrained amplitudes + for (const auto &term : shared_terms) + constrained_amp_map[unique_amp].push_back(term->fullName()); + } + return constrained_amp_map; + } + + if (sumAmpNamesAreConstrained()) + { + for (const auto &unique_sum_amp : uniqueSumAmpNames()) + { + std::string sum = unique_sum_amp.substr(0, unique_sum_amp.find("::")); + std::string amp = unique_sum_amp.substr(unique_sum_amp.find("::") + 2); + // the bool assures us that all terms with this sum::amp name are + // constrained to each other + std::vector shared_terms = m_cfg_info->termList("", sum, amp); + + // store the full names of the constrained amplitudes + for (const auto &term : shared_terms) + constrained_amp_map[unique_sum_amp].push_back(term->fullName()); + } + return constrained_amp_map; + } + + // if we reach here, neither assumption held, so just map full amplitude names to + // themselves + for (const auto &litude : m_fit_results.ampList()) + { + constrained_amp_map[amplitude] = {amplitude}; + } + return constrained_amp_map; +} + +bool FitConverter::ampNamesAreConstrained() const +{ + for (const auto &unique_amp : uniqueAmpNames()) + { + std::vector shared_terms = m_cfg_info->termList("", "", unique_amp); + + // we assume terms with common amplitude names are all constrained to each + // other, so use first term to check this + std::vector constrained_terms = shared_terms[0]->constraints(); + + // filter constrained terms to only those with the same amplitude name + for (auto it = constrained_terms.begin(); it != constrained_terms.end();) + { + if ((*it)->fullName().find(unique_amp) == std::string::npos) + it = constrained_terms.erase(it); + else + ++it; + } + + // The constrained terms does not include the self term, so add it back in for + // comparison + constrained_terms.push_back(shared_terms[0]); + + // Compare the two sets of terms to ensure they are the same + std::sort(shared_terms.begin(), shared_terms.end()); + std::sort(constrained_terms.begin(), constrained_terms.end()); + if (shared_terms != constrained_terms) + { + return false; + } + } + return true; +} + +bool FitConverter::sumAmpNamesAreConstrained() const +{ + for (const auto &unique_sum_amp : uniqueSumAmpNames()) + { + std::string sum = unique_sum_amp.substr(0, unique_sum_amp.find("::")); + std::string amp = unique_sum_amp.substr(unique_sum_amp.find("::") + 2); + + std::vector shared_terms = m_cfg_info->termList("", sum, amp); + + // we assume terms with common sum::amp names are all constrained to each + // other, so use first term to check this + std::vector constrained_terms = shared_terms[0]->constraints(); + + // filter constrained terms to only those with the same sum::amp name + for (auto it = constrained_terms.begin(); it != constrained_terms.end();) + { + if ((*it)->fullName().find(unique_sum_amp) == std::string::npos) + it = constrained_terms.erase(it); + else + ++it; + } + + // The constrained terms does not include the self term, so add it back in for + // comparison + constrained_terms.push_back(shared_terms[0]); + + // Compare the two sets of terms to ensure they are the same + std::sort(shared_terms.begin(), shared_terms.end()); + std::sort(constrained_terms.begin(), constrained_terms.end()); + if (shared_terms != constrained_terms) + { + return false; + } + } + return true; +} + +std::string FitConverter::getReactionString(const std::string &full_amplitude) +{ + size_t first_colon = full_amplitude.find("::"); + size_t second_colon = full_amplitude.find("::", first_colon + 2); + if (first_colon == std::string::npos || second_colon == std::string::npos) + { + report(ERROR, kModule) << "Amplitude name " << full_amplitude << " does not match expected format 'reaction::sum::amp'.\n"; + assert(false); + } + std::string reaction = full_amplitude.substr(0, first_colon); + return reaction; +} \ No newline at end of file diff --git a/src/libraries/UTILITIES/FitConverter.h b/src/libraries/UTILITIES/FitConverter.h new file mode 100644 index 000000000..d65bbba65 --- /dev/null +++ b/src/libraries/UTILITIES/FitConverter.h @@ -0,0 +1,339 @@ +/** + * @file FitConverter.h + * @author Kevin Scheuer + * @brief Class for converting AmpTools fit results to any format + * @date 2026-05-08 + * + * @note that this currently is built to convert to CSV, but the class can be extended + * to convert to any other format by accessing the various containers that store + * the fit results. + */ + +#ifndef FIT_CONVERTER_H +#define FIT_CONVERTER_H + +#include +#include +#include +#include +#include +#include + +#include "AmplitudeParser.h" +#include "IUAmpTools/FitResults.h" + +/** + * @class FitConverter + * @brief Converts AmpTools FitResults to any format + * + * This class extracts fit results from a single AmpTools .fit file. It can then output + * the data in various formats, such as CSV. The extracted data includes: + * - Standard fit outputs (likelihood, event counts, status codes) + * - AmpTools parameters + * - Real/Imaginary parts of production coefficients + * - Intensities of unique amplitudes. See constrainedAmps() for how unique amplitudes + * groups are defined. + * - Coherent sums of amplitudes by quantum numbers + * - Phase differences + * + * Example usage in convert_to_csv.cc: + * @code + * std::ofstream csv_file(output_file); + * std::stringstream csv_data; + * bool header_written = false; + * + * for (const auto& file : input_files) { + * FitConverter converter(file, acceptance_corrected); + * + * if (!header_written) { + * csv_data << converter.getCSVHeader() << "\n"; + * header_written = true; + * } + * csv_data << converter.getCSVRow() << "\n"; + * } + * + * csv_file << csv_data.str(); + * csv_file.close(); + * @endcode + */ +class FitConverter +{ +public: + /** + * @brief Construct a FitConverter for a single fit file + * @param[in] filename Path to the .fit file + * @param[in] acceptance_correct Whether to use acceptance-corrected intensities + * @param[in] mute_warning Suppress warning about free parameters if true + * @param[in] naming_scheme Amplitude naming scheme: auto, JLme, eJPmL, or Lme + * @throw ERROR If the fit file cannot be read or is invalid + */ + FitConverter( + const std::string &filename, + const bool &acceptance_correct = false, + bool mute_warning = false, + const std::string &naming_scheme = "auto"); + + /** + * @brief Extract fit data from the .fit file + */ + void extract(); + + /** + * @brief Get CSV header string for the fit results + * + * Constructs a comma-separated header string based on the keys of the various + * containers storing the fit results. + * + * @note It is crucial that the order of entries in the header matches the order of + * values in getCSVRow(). + * + * @return Comma-separated header string + */ + std::string getCSVHeader() const; + + /** + * @brief Get CSV row string for the fit results + * + * Constructs a comma-separated row string based on the values of the various + * containers storing the fit results. + * + * @note It is crucial that the order of values in this row matches the order of + * entries in getCSVHeader(). + * + * @return Comma-separated row string + */ + std::string getCSVRow() const; + + /** + * @brief Get the CSV header string for the covariance matrix + * + * @return std::string "file, parameter, par1, par2, ..., parN" + */ + std::string getCSVCovarianceMatrixHeader() const; + + /** + * @brief Get the CSV representation of the covariance matrix + * + * Every row corresponds to a parameter, with the first two columns being the file + * and parameter name, followed by the covariance values with all other parameters. + */ + std::string getCSVCovarianceMatrix() const; + + /** + * @brief Get the CSV header string for the correlation matrix + * + * @return std::string "file, parameter, par1, par2, ..., parN" + */ + std::string getCSVCorrelationMatrixHeader() const; + + /** + * @brief Get the CSV representation of the correlation matrix + * + * Every row corresponds to a parameter, with the first two columns being the file + * and parameter name, followed by the correlation values with all other parameters. + */ + std::string getCSVCorrelationMatrix() const; + + /** + * @brief Get the CSV header string for the normalization integral matrix + * + * @return std::string "file, amplitude, amp1, amp2, ..., ampN" + */ + std::string getCSVNormIntMatrixHeader() const; + + /** + * @brief Get the CSV representation of the normalization integral matrix + * + * Every row corresponds to an amplitude, with the first two columns being the file + * and amplitude names, followed by complex normalization integral values with all + * other amplitudes. Complex values are formatted as pandas-readable strings (e.g., + * "1+2j"). + */ + std::string getCSVNormIntMatrix() const; + + /** + * @brief return set of all unique amplitude names in a fit result + * + * A "full" amplitude is a "reaction::sum::ampName" string, so this extracts all + * unique "ampName" strings + */ + std::set uniqueAmpNames() const; + + /** + * @brief return set of all unique coherent sum + amp names in a fit result + * + * A "full" amplitude is a "reaction::sum::ampName" string, so this extracts all + * unique "sum::ampName" strings + * + * @return std::set + */ + std::set uniqueSumAmpNames() const; + + bool isValid() const { return m_fit_results.valid(); } + + /** + * @brief Map of AmpTools standard fit outputs and their values + * + * Standard outputs include values such as: + * - eMatrixStatus + * - lastMinuitCommandStatus + * - likelihood + * - intensity + */ + const std::map &standardResults() const { return m_standard_results; } + + /** + * @brief Map of AmpTools parameters and their values and errors + * + * Ignores production coefficients, which typically have "::" in their names + */ + const std::map> ¶meters() const { return m_parameters; } + + /** + * @brief Map of unique amplitude names and their related constrained amplitudes + * + * A "full" amplitude is a "reaction::sum::ampName" string, and often amplitudes + * are constrained across reactions and sums. The goal is to identify the shortest + * identifying string that can be used to group amplitudes that are constrained to + * each other. It will first attempt to group by "ampName", then by "sum::ampName", + * and if neither of those assumptions hold, it will just group by the full + * amplitude string. + * + * For example, if we have the following full amplitude strings: + * - "reaction1::sum1::myAmpName" + * - "reaction1::sum2::myAmpName" + * - "reaction2::sum1::myAmpName" + * + * We first check if all amplitudes with the same "ampName" (myAmpName) are + * constrained to each other, and if so, we group them under the unique key + * "myAmpName". If that assumption does not hold, we check if all amplitudes with + * the same "sum::ampName" are constrained to each other (i.e. across reactions) and + * if so, we group them under the unique key "sum::ampName". If neither assumption + * holds, we just group by the full amplitude name. + * + * @note that the self-constraint is included in the list, so even amplitudes with + * no constraints will have a vector of size one (itself). + */ + const std::map> & + constrainedAmps() const { return m_constrained_amps; } + + /** + * @brief Map of unique amplitude intensities to their values and errors + * + * Contains the intensity (and error) for each unique amplitude grouping, + * calculated using all the constrained amplitudes for that group. + */ + const std::map> & + uniqueAmpIntensities() const { return m_unique_amp_intensities; } + + /** + * @brief Map of coherent-sum labels to their values and errors + * + * The labels are built from the selected amplitude naming scheme and the + * configured coherent-sum groupings + */ + const std::map> & + coherentSumIntensities() const { return m_coherent_sum_intensities; } + + /** + * @brief Map of unique amplitude names to their production coefficients + * + * Contains complex production coefficients for each unique amplitude name + * extracted from the fit results. + */ + const std::map> & + productionCoefficients() const { return m_production_coefficients; } + + /** + * @brief Map of full amplitude pairs to their phase differences and errors + * + * Only includes pairs of unique amplitude names that share a reaction and sum. For + * 2 amplitudes "amp1" and "amp2", we search through every combination in + * m_constrained_amps[amp1] and m_constrained_amps[amp2] to find pairs that share + * the same "reaction::sum". The key is a size 2 vector of the full amplitude + * strings, and the value is a pair of the phase difference and its error. + */ + const std::map, std::pair> & + phaseDifferences() const { return m_phase_differences; } + + /** + * @brief Get the covariance matrix of the fit parameters. + * + * The error matrix is a 2D vector where each entry [i][j] corresponds to + * the covariance between parameter i and parameter j. The order of the rows and + * columns matches the order of parameters in FitResults::parNameList(). Note + * that we do include production parameters in this matrix, as opposed to the + * m_parameters map which excludes them. + */ + const std::vector> & + errorMatrix() const { return m_error_matrix; } + + ~FitConverter() = default; + +private: + const std::string m_fit_file; ///< Path to the .fit file + const FitResults m_fit_results; ///< AmpTools FitResults object + const ConfigurationInfo *m_cfg_info; ///< ConfigurationInfo from the fit + const bool m_is_acceptance_corrected; ///< Whether to use acceptance-corrected intensities + const AmplitudeParser m_amplitude_parser; ///< Parser for amplitude labels and sums + + // see accessor methods for descriptions + std::map m_standard_results; + std::map> m_parameters; + std::map> m_constrained_amps; + std::map> m_unique_amp_intensities; + std::map> m_coherent_sum_intensities; + std::map> m_production_coefficients; + std::map, std::pair> m_phase_differences; + std::vector> m_error_matrix; + + /** + * @brief Get map of unique amplitude groups to their constrained full amplitudes + * + * This attempts to identify the unique strings that can be a coupled to a set of + * constrained amplitudes. For a "full" amplitude string of the form + * "reaction::sum::ampName", it first assumes that amplitudes with the same ampName + * are constrained to each other, across reactions and sums. If not, it peels back a + * layer and assumes that amplitudes with the same "sum::ampName" are constrained to + * each other, across reactions. If that fails, it simply saves a mapping of the + * full amplitude to itself. The map keys are whats extracted as the unique + * amplitude names (ampName, sum::ampName, or full amplitude), and the values are + * vectors of the full amplitude strings that are constrained to each other. + * + * @note the keys are whats written to the CSV, and is why it first attempts to + * extract just the ampName for simplicity. + */ + std::map> findConstrainedAmps() const; + + /** + * @brief Checks if amplitude names are constrained to each other + * + * A "full" amplitude is a "reaction::sum::ampName" string, so this checks if all + * amplitudes with the same "ampName" are constrained to each other across reactions + * and sums. + */ + bool ampNamesAreConstrained() const; + + /** + * @brief Checks if coherent sum + amplitude names are constrained to each other + * + * A "full" amplitude is a "reaction::sum::ampName" string, so this checks if all + * amplitudes with the same "sum::ampName" are constrained to each other across + * reactions. + */ + bool sumAmpNamesAreConstrained() const; + + /** + * @brief Get the reaction string from a full amplitude name + * + * This extracts the reaction part from a "reaction::sum::ampName" string. + * + * @param full_amp_name The full amplitude name + * @return The reaction string + */ + static std::string getReactionString(const std::string &full_amp_name); + + static const char *kModule; +}; + +#endif // FIT_CONVERTER_H \ No newline at end of file diff --git a/src/libraries/UTILITIES/RootDataConverter.cc b/src/libraries/UTILITIES/RootDataConverter.cc new file mode 100644 index 000000000..57bc513b2 --- /dev/null +++ b/src/libraries/UTILITIES/RootDataConverter.cc @@ -0,0 +1,1005 @@ +/** + * @file RootDataConverter.cc + * @author Kevin Scheuer + * @brief Implementation of RootDataConverter class + * @date 2026-05-08 + * + */ + +#include +#include +#include +#include + +#include "RootDataConverter.h" +#include "IUAmpTools/report.h" + +#include "TFile.h" +#include "TKey.h" +#include "TClass.h" +#include "TCollection.h" +#include "TROOT.h" +#include "TTree.h" +#include "TLorentzVector.h" +#include "ROOT/RDataFrame.hxx" + +const char *RootDataConverter::kModule = "RootDataConverter"; + +RootDataConverter::RootDataConverter(const std::string &filename, bool mute_warnings) : RootDataConverter(filename, {1}, mute_warnings) {} + +RootDataConverter::RootDataConverter(const std::string &filename, + const std::vector &lower_vertex_indices, + bool mute_warnings) : m_fit_file(filename), + m_fit_results(filename, mute_warnings), + m_cfg_info(m_fit_results.configInfo()), + m_mute_warnings(mute_warnings), + m_data_files(dataFiles()), + m_background_files(backgroundFiles()), + m_genMC_files(genMCFiles()), + m_accMC_files(accMCFiles()), + m_data_tree_name(findTreeName("data")), + m_background_tree_name(findTreeName("background")), + m_genMC_tree_name(findTreeName("genMC")), + m_accMC_tree_name(findTreeName("accMC")) +{ + report(DEBUG, kModule) << "Constructing RootDataConverter for file: " << m_fit_file << "\n"; + + if (!m_fit_results.valid()) + { + report(ERROR, kModule) << "RootDataConverter ERROR: Unable to read fit results from file " + m_fit_file << "\n"; + assert(false); + } + + setLowerVertexIndices(lower_vertex_indices); + setUpperVertexIndices(); + + // print debug info detailing which particles are labeled as upper or lower vertex + // note that index 0 is always the beam photon + const std::vector particle_list = m_cfg_info->reaction(m_fit_results.reactionList()[0])->particleList(); + report(DEBUG, kModule) << "The following particles have been labeled as lower vertex particles:\n"; + for (const auto &idx : m_lower_vertex_indices) + { + report(DEBUG, kModule) << "i = " << idx << " : " << particle_list[idx] << "\n"; + } + report(DEBUG, kModule) << "The following particles are thus labeled as upper vertex particles:\n"; + for (const auto &idx : m_upper_vertex_indices) + { + report(DEBUG, kModule) << "i = " << idx << " : " << particle_list[idx] << "\n"; + } + + m_background_files_exist = !m_background_files.empty(); + + validateDataFiles(); + validateBackgroundFiles(); // if no bkgnd files exist, warns user that weights assumed to be in data tree + validateGenMCFiles(); + validateAccMCFiles(); + + // fill the m_values map with all the relevant information from the fit results and + // associated files that we want to save to csv later. This includes: + // - Bin edges, centers, averages, and RMS values + // - total number of detected and acceptance-corrected events + // - efficiency based on genMC and accMC event counts + extract(); +} + +void RootDataConverter::extract() +{ + // get weight branch name. If weight branch is not found, will return empty string + // and we'll assume weights of 1.0 throughout our stats functions + std::string weight_branch_name; + if (m_background_files_exist) + weight_branch_name = weightBranchName("background", m_background_tree_name); + else + weight_branch_name = weightBranchName("data", m_data_tree_name); + + if (weight_branch_name.empty()) + { + report(DEBUG, kModule) << "No weight branch found. Assuming weights of 1.0 for all events.\n"; + } + else + { + report(DEBUG, kModule) << "Weight branch found: " << weight_branch_name << "\n"; + } + + // Extract min, max, mean, and RMS of the beam energy, -t, and mass of the upper + // vertex system. Background subtraction and proper weighting are all included. + TH1D *h_beam_energy = beamEnergyHist(weight_branch_name); + m_values["e_low"] = h_beam_energy->GetXaxis()->GetXmin(); + m_values["e_high"] = h_beam_energy->GetXaxis()->GetXmax(); + m_values["e_center"] = 0.5 * (m_values["e_low"] + m_values["e_high"]); + m_values["e_avg"] = h_beam_energy->GetMean(); + m_values["e_rms"] = h_beam_energy->GetRMS(); + + report(DEBUG, kModule) << "Beam energy statistics:\n"; + report(DEBUG, kModule) << " Min: " << m_values["e_low"] << "\n"; + report(DEBUG, kModule) << " Max: " << m_values["e_high"] << "\n"; + report(DEBUG, kModule) << " Center: " << m_values["e_center"] << "\n"; + report(DEBUG, kModule) << " Mean: " << m_values["e_avg"] << "\n"; + report(DEBUG, kModule) << " RMS: " << m_values["e_rms"] << "\n"; + + TH1D *h_four_momentum_transfer = tHist(weight_branch_name); + m_values["t_low"] = h_four_momentum_transfer->GetXaxis()->GetXmin(); + m_values["t_high"] = h_four_momentum_transfer->GetXaxis()->GetXmax(); + m_values["t_center"] = 0.5 * (m_values["t_low"] + m_values["t_high"]); + m_values["t_avg"] = h_four_momentum_transfer->GetMean(); + m_values["t_rms"] = h_four_momentum_transfer->GetRMS(); + + report(DEBUG, kModule) << "-t statistics:" << "\n"; + report(DEBUG, kModule) << " Min: " << m_values["t_low"] << "\n"; + report(DEBUG, kModule) << " Max: " << m_values["t_high"] << "\n"; + report(DEBUG, kModule) << " Center: " << m_values["t_center"] << "\n"; + report(DEBUG, kModule) << " Mean: " << m_values["t_avg"] << "\n"; + report(DEBUG, kModule) << " RMS: " << m_values["t_rms"] << "\n"; + + TH1D *h_upper_vertex_mass = massHist(weight_branch_name); + m_values["m_low"] = h_upper_vertex_mass->GetXaxis()->GetXmin(); + m_values["m_high"] = h_upper_vertex_mass->GetXaxis()->GetXmax(); + m_values["m_center"] = 0.5 * (m_values["m_low"] + m_values["m_high"]); + m_values["m_avg"] = h_upper_vertex_mass->GetMean(); + m_values["m_rms"] = h_upper_vertex_mass->GetRMS(); + + report(DEBUG, kModule) << "Mass statistics:" << "\n"; + report(DEBUG, kModule) << " Min: " << m_values["m_low"] << "\n"; + report(DEBUG, kModule) << " Max: " << m_values["m_high"] << "\n"; + report(DEBUG, kModule) << " Center: " << m_values["m_center"] << "\n"; + report(DEBUG, kModule) << " Mean: " << m_values["m_avg"] << "\n"; + report(DEBUG, kModule) << " RMS: " << m_values["m_rms"] << "\n"; + + // Its not particularly important which histogram is used to calculate the number + // of events, as they all use the same weights. + double events, events_err; + std::tie(events, events_err) = numberOfEvents(h_beam_energy); + m_values["events"] = events; + m_values["events_err"] = events_err; + + report(DEBUG, kModule) << "Total number of events: " << m_values["events"] + << " +/- " << m_values["events_err"] << "\n"; + + // Calculate efficiency and acceptance-corrected number of events + double eff = efficiency(); + m_values["efficiency"] = eff; + report(DEBUG, kModule) << " Efficiency: " << eff << "\n"; + + double ac_events, ac_events_err; + std::tie(ac_events, ac_events_err) = acceptanceCorrectedEvents(events, events_err, eff); + m_values["ac_events"] = ac_events; + m_values["ac_events_err"] = ac_events_err; + report(DEBUG, kModule) << "Acceptance corrected events: " << ac_events << " +/- " << ac_events_err << "\n"; +} + +std::string RootDataConverter::getCSVHeader() const +{ + std::string header = "file"; + for (const auto &pair : m_values) + { + header += "," + pair.first; + } + return header; +} + +std::string RootDataConverter::getCSVRow() const +{ + std::string row = m_fit_file; + for (const auto &pair : m_values) + { + row += "," + std::to_string(pair.second); + } + return row; +} + + +std::vector RootDataConverter::findFiles(const std::string &file_type) const +{ + std::vector files; + + // get directory of the fit file + std::filesystem::path dir = std::filesystem::path(m_fit_file).parent_path(); + + const std::vector reaction_list = m_fit_results.reactionList(); + for (std::string reaction : reaction_list) + { + std::pair> file_pair; + + if (file_type == "data") + file_pair = m_cfg_info->reaction(reaction)->data(); + else if (file_type == "background") + file_pair = m_cfg_info->reaction(reaction)->bkgnd(); + else if (file_type == "genMC") + file_pair = m_cfg_info->reaction(reaction)->genMC(); + else if (file_type == "accMC") + file_pair = m_cfg_info->reaction(reaction)->accMC(); + else + { + report(ERROR, kModule) << "Unknown file type requested: " + << file_type + << "\n"; + assert(false); + } + + std::string data_reader_name = file_pair.first; + std::vector data_reader_args = file_pair.second; + + std::filesystem::path file_path = data_reader_args[0]; + + // assumes that relative paths are relative to the fit file directory + if (!file_path.is_absolute()) + file_path = dir / file_path; + + // canonical will remove any ../ or ./ from the path, while also checking + // for existence + try + { + files.push_back(std::filesystem::canonical(file_path).string()); + } + catch (const std::filesystem::filesystem_error &e) + { + report(ERROR, kModule) << "Error accessing " + << file_type + << " file: " + << file_path << "\n" + << e.what() << "\n"; + assert(false); + } + } + return files; +} + +std::string RootDataConverter::findTreeName(const std::string &file_type) const +{ + const std::vector reaction_list = m_fit_results.reactionList(); + + // assume all reactions use the same tree name, so just get the first one + std::string reaction = reaction_list[0]; + std::pair> file_pair; + std::string file_path; + if (file_type == "data") + { + file_pair = m_cfg_info->reaction(reaction)->data(); + file_path = m_data_files[0]; + } + else if (file_type == "background") + { + file_pair = m_cfg_info->reaction(reaction)->bkgnd(); + file_path = m_background_files[0]; + } + else if (file_type == "genMC") + { + file_pair = m_cfg_info->reaction(reaction)->genMC(); + file_path = m_genMC_files[0]; + } + else if (file_type == "accMC") + { + file_pair = m_cfg_info->reaction(reaction)->accMC(); + file_path = m_accMC_files[0]; + } + else + { + report(ERROR, kModule) << "Unknown file type requested: " + << file_type + << "\n"; + assert(false); + } + + // if only one tree exists in the file, return that tree name + std::vector tree_names = treesInFile(file_path); + if (tree_names.size() == 1) + return tree_names[0]; + + // otherwise, we have to extract the tree name from the data reader arguments + // which unfortunately differ for each data reader + std::string data_reader_name = file_pair.first; + std::vector data_reader_args = file_pair.second; + + if (data_reader_name == "ROOTDataReader" && data_reader_args.size() == 2) + { + return data_reader_args[1]; + } + else if (data_reader_name == "ROOTDataReaderBootstrap" && data_reader_args.size() == 3) + { + return data_reader_args[2]; + } + else if (data_reader_name == "FSRootDataReader") + { + return data_reader_args[1]; + } + else if (data_reader_name == "FSRootDataReaderBootstrap") + { + return data_reader_args[1]; + } + else + { + if (!m_mute_warnings) + report(WARNING, kModule) << "Unknown data reader or insufficient arguments to" + << " determine tree name. Defaulting to 'kin'\n"; + return "kin"; // default tree name + } +} + +std::vector RootDataConverter::treesInFile(const std::string &filename) const +{ + std::vector tree_names; + + TFile *f = TFile::Open(filename.c_str()); + if (!f || f->IsZombie()) + { + report(ERROR, kModule) << "Error opening ROOT file: " + filename + "\n"; + assert(false); + } + + TIter nextkey(f->GetListOfKeys()); + TKey *key; + while ((key = (TKey *)nextkey())) + { + TClass *cl = gROOT->GetClass(key->GetClassName()); + if (cl->InheritsFrom("TTree")) + { + tree_names.push_back(key->GetName()); + } + } + + f->Close(); + return tree_names; +} + +std::string RootDataConverter::weightBranchName( + const std::string &file_type, + const std::string &tree_name) const +{ + // TODO: verify that this friend tree works. May have to add friend tree to each + // file to access the weight branch later + + // FSRoot-based data readers can optionally pass a friend weight branch name as an + // argument, so check explicitly for that first + const std::vector reaction_list = m_fit_results.reactionList(); + std::string reaction = reaction_list[0]; // assume all reactions use the same weight branch + std::pair> file_pair; // data reader name and args + std::string root_file; + if (file_type == "data") + { + file_pair = m_cfg_info->reaction(reaction)->data(); + root_file = m_data_files[0]; + } + else if (file_type == "background") + { + file_pair = m_cfg_info->reaction(reaction)->bkgnd(); + root_file = m_background_files[0]; + } + else + { + report(ERROR, kModule) << "Unknown file type requested: " + << file_type + << "\n"; + assert(false); + } + + std::string data_reader_name = file_pair.first; + std::vector data_reader_args = file_pair.second; + + // the FSRootDataReader can have special friend trees that hold the event weights + if (data_reader_name.find("FSRootDataReader") != std::string::npos && data_reader_args.size() >= 6) + { + std::string friend_file_name = data_reader_args[3]; + std::string friend_tree_name = data_reader_args[4]; + std::string weight_branch_name = data_reader_args[5]; + + // to access the weight branch, we need to add the friend tree + TFile *f = TFile::Open(root_file.c_str()); + if (!f || f->IsZombie()) + { + report(ERROR, kModule) << "Error opening ROOT file: " + root_file + "\n"; + assert(false); + } + TTree *tree = f->Get(tree_name.c_str()); + if (!tree) + { + report(ERROR, kModule) << "Error: Tree '" + tree_name + "' not found in file: " + root_file + "\n"; + assert(false); + } + tree->AddFriend(friend_tree_name.c_str(), friend_file_name.c_str()); + return friend_tree_name + "." + weight_branch_name; + } + + // not FSROOTDataReader, so no friend tree weights. Proceed to check for weight + // branches in the main tree + TFile *f = TFile::Open(root_file.c_str()); + if (!f || f->IsZombie()) + { + report(ERROR, kModule) << "Error opening ROOT file: " + root_file + "\n"; + assert(false); + } + TTree *tree = f->Get(tree_name.c_str()); + if (!tree) + { + report(ERROR, kModule) << "Error: Tree '" + tree_name + "' not found in file: " + root_file + "\n"; + assert(false); + } + // check if weight branch exists before returning it. Most analyses use "weight" or + // "Weight" branches + if (tree->GetBranch("weight")) + { + f->Close(); + return "weight"; + } + else if (tree->GetBranch("Weight")) + { + f->Close(); + return "Weight"; + } + else + { + f->Close(); + if (!m_mute_warnings) + report(WARNING, kModule) << "Error: Weight branch not found in tree '" + << tree_name + << "' in file: " + << root_file + << ". Setting weights to 1.0\n"; + return ""; // return empty string to indicate no weight branch found + } +} + +TH1D *RootDataConverter::beamEnergyHist(const std::string &weight_branch_name) +{ + // we will add all the data (and possible) weighted background energy histograms + // together to get overall beam energy stats + TH1D *h_energy_total = nullptr; + + double min, max; + std::tie(min, max) = findMinMaxOfBranch(m_data_files, m_data_tree_name, "EnPB"); + + if (m_background_files_exist) + { + double bg_min, bg_max; + std::tie(bg_min, bg_max) = findMinMaxOfBranch(m_background_files, m_background_tree_name, "EnPB"); + + // set the min and max for the total histogram to encompass both data and background + min = std::min(min, bg_min); + max = std::max(max, bg_max); + } + + // ==== DATA ENERGY HISTOGRAMS ==== + for (const std::string &data_file : m_data_files) + { + TFile *f = TFile::Open(data_file.c_str()); + if (!f || f->IsZombie()) + { + report(ERROR, kModule) << "Error opening data ROOT file: " << data_file << "\n"; + assert(false); + } + TTree *tree = f->Get(m_data_tree_name.c_str()); + if (!tree) + { + report(ERROR, kModule) << "Error retrieving tree " << m_data_tree_name << " from file " << data_file << "\n"; + f->Close(); + assert(false); + } + + // Set up branch addresses + double energy; + tree->SetBranchAddress("EnPB", &energy); + + // draw into temporary histogram + const int n_bins = 200; + TH1D *h_energy = new TH1D("h_energy", "", n_bins, min, max); + + // If no background files exist, we assume weights are stored in the data tree + if (!m_background_files_exist && !weight_branch_name.empty() && tree->GetBranch(weight_branch_name.c_str())) + { + double weight; + tree->SetBranchAddress(weight_branch_name.c_str(), &weight); + tree->Draw("EnPB>>h_energy", weight_branch_name.c_str(), "goff"); + } + else if (m_background_files_exist) + { + tree->Draw("EnPB>>h_energy", "1.0", "goff"); // if bkgnd files exist, we will subtract them later, so just use weight of 1.0 here + } + else + { + tree->Draw("EnPB>>h_energy", "1.0", "goff"); + if (!m_mute_warnings) + report(WARNING, kModule) << "No weight branch found in tree '" + << m_data_tree_name + << "' in file: " + << data_file + << "\n" + << "Assuming weights of 1.0 for all events.\n"; + } + + // if this is first hist, clone it to the total hist. Otherwise, add to the total hist + if (!h_energy_total) + { + h_energy_total = (TH1D *)h_energy->Clone("h_energy_total"); + h_energy_total->SetDirectory(0); // detach from file + } + else + { + h_energy_total->Add(h_energy); + } + + delete h_energy; // clean up temporary histogram + f->Close(); + } + + // ==== BACKGROUND ENERGY HISTOGRAMS ==== + // if background files exist, we'll subtract the weighted background energy + // histograms from the total energy histogram, which we will use for our stats. + for (const std::string &bg_file : m_background_files) + { + TFile *f = TFile::Open(bg_file.c_str()); + if (!f || f->IsZombie()) + { + report(ERROR, kModule) << "Error opening background ROOT file: " << bg_file << "\n"; + assert(false); + } + TTree *tree = f->Get(m_background_tree_name.c_str()); + if (!tree) + { + report(ERROR, kModule) << "Error retrieving tree " << m_background_tree_name << " from file " << bg_file << "\n"; + f->Close(); + assert(false); + } + + // Set up branch addresses + double energy; + double weight = 1.0; // default weight is 1.0 if no weight branch exists + + tree->SetBranchAddress("EnPB", &energy); + + // If weight branch exists, set its address + if (!weight_branch_name.empty() && tree->GetBranch(weight_branch_name.c_str())) + { + tree->SetBranchAddress(weight_branch_name.c_str(), &weight); + } + + // draw into temporary histogram + const int n_bins = 200; + TH1D *h_energy = new TH1D("h_energy", "", n_bins, min, max); + tree->Draw("EnPB>>h_energy", weight_branch_name.c_str(), "goff"); + + // subtract background histogram from total histogram + if (h_energy_total) + h_energy_total->Add(h_energy, -1.0); + + delete h_energy; + f->Close(); + } + + return h_energy_total; +} + +TH1D *RootDataConverter::tHist(const std::string &weight_branch_name) +{ + // target proton mass (GeV) + const double m_proton = 0.938; + + if (m_lower_vertex_indices.empty()) + { + report(ERROR, kModule) << "No lower vertex indices set; cannot compute -t\n"; + assert(false); + } + + // Prepare branch expression strings like "PxP1 + PxP2 + ..." + std::string recoil_px_expr; + std::string recoil_py_expr; + std::string recoil_pz_expr; + std::string recoil_e_expr; + for (size_t i = 0; i < m_lower_vertex_indices.size(); ++i) + { + int idx = m_lower_vertex_indices[i]; + std::string sep = (i == 0) ? "" : " + "; + recoil_px_expr += sep + "PxP" + std::to_string(idx); + recoil_py_expr += sep + "PyP" + std::to_string(idx); + recoil_pz_expr += sep + "PzP" + std::to_string(idx); + recoil_e_expr += sep + "EnP" + std::to_string(idx); + } + + // ---- DATA histogram via RDataFrame ---- + TH1D *h_result = nullptr; + try + { + ROOT::RDataFrame df_data(m_data_tree_name, m_data_files); + + // Define recoil components and t (Mandelstam t = (target - recoil)^2) + auto df = df_data.Define("recoil_px", recoil_px_expr) + .Define("recoil_py", recoil_py_expr) + .Define("recoil_pz", recoil_pz_expr) + .Define("recoil_E", recoil_e_expr) + .Define("t_value", + [m_proton](double rE, double rpx, double rpy, double rpz) + { + double dE = m_proton - rE; + double p2 = rpx * rpx + rpy * rpy + rpz * rpz; + double t = dE * dE - p2; + return std::fabs(t); + }, + {"recoil_E", "recoil_px", "recoil_py", "recoil_pz"}); + + // Determine min and max from data (unweighted) + auto data_min_r = df.Min("t_value"); + auto data_max_r = df.Max("t_value"); + double data_min = *data_min_r; + double data_max = *data_max_r; + + double global_min = data_min; + double global_max = data_max; + + // Build data histogram (if background exists, don't apply weights to data here) + const int n_bins = 200; + ROOT::RDF::RResultPtr h_data; + if (m_background_files_exist) + { + h_data = df.Histo1D({"h_t_data", "t (data)", n_bins, global_min, global_max}, "t_value"); + } + else + { + // if no background files, use weight branch from data (if provided) + if (!weight_branch_name.empty()) + { + // if weight branch references a friend (contains a '.'), unclear if RDataFrame can handle that, so skip weights in that case with a warning + // TODO: test the friend tree case + if (weight_branch_name.find('.') != std::string::npos) + { + report(WARNING, kModule) << "Weight branch appears to reference a friend tree (" << weight_branch_name << ") - skipping weights for RDataFrame histogram\n"; + h_data = df.Histo1D({"h_t_data", "t (data)", n_bins, global_min, global_max}, "t_value"); + } + else + { + h_data = df.Histo1D({"h_t_data", "t (data)", n_bins, global_min, global_max}, "t_value", weight_branch_name); + } + } + else + { + h_data = df.Histo1D({"h_t_data", "t (data)", n_bins, global_min, global_max}, "t_value"); + } + } + + // ---- BACKGROUND histogram via RDataFrame (if present) ---- + if (m_background_files_exist && !m_background_files.empty()) + { + ROOT::RDF::RResultPtr h_bkg; + ROOT::RDataFrame df_bg(m_background_tree_name, m_background_files); + auto dfb = df_bg.Define("recoil_px", recoil_px_expr) + .Define("recoil_py", recoil_py_expr) + .Define("recoil_pz", recoil_pz_expr) + .Define("recoil_E", recoil_e_expr) + .Define("t_value", + [m_proton](double rE, double rpx, double rpy, double rpz) + { + double dE = m_proton - rE; + double p2 = rpx * rpx + rpy * rpy + rpz * rpz; + double t = dE * dE - p2; + return std::fabs(t); + }, + {"recoil_E", "recoil_px", "recoil_py", "recoil_pz"}); + + // Expand global range to include background extremes + auto bg_min_r = dfb.Min("t_value"); + auto bg_max_r = dfb.Max("t_value"); + double bg_min = *bg_min_r; + double bg_max = *bg_max_r; + global_min = std::min(global_min, bg_min); + global_max = std::max(global_max, bg_max); + + // Recreate histograms with unified ranges + h_data = df.Histo1D({"h_t_data", "t (data)", n_bins, global_min, global_max}, "t_value"); + + // background histogram should use weight branch (weight_branch_name was chosen from background earlier) + if (!weight_branch_name.empty() && weight_branch_name.find('.') == std::string::npos) + { + h_bkg = dfb.Histo1D({"h_t_bkg", "t (bg)", n_bins, global_min, global_max}, "t_value", weight_branch_name); + } + else + { + if (weight_branch_name.find('.') != std::string::npos) + report(WARNING, kModule) << "Background weight branch references friend tree; skipping weights for bg RDataFrame histogram\n"; + h_bkg = dfb.Histo1D({"h_t_bkg", "t (bg)", n_bins, global_min, global_max}, "t_value"); + } + + // Subtract background from a standalone clone so RResultPtr ownership stays intact. + h_result = (TH1D *)h_data->Clone("h_t_result"); + h_result->SetDirectory(nullptr); + h_result->Add(h_bkg.GetPtr(), -1.0); + } + else + { + // set h_result for consistent return variable + h_result = (TH1D *)h_data->Clone("h_t_result"); + h_result->SetDirectory(nullptr); + } + } + catch (const std::exception &e) + { + report(ERROR, kModule) << "RDataFrame error computing -t: " << e.what() << "\n"; + assert(false); + } + + return h_result; +} + +TH1D *RootDataConverter::massHist(const std::string &weight_branch_name) +{ + if (m_upper_vertex_indices.empty()) + { + report(ERROR, kModule) << "No upper vertex indices set; cannot compute mass\n"; + assert(false); + } + + // Prepare branch expression strings like "PxP2 + PxP3 + ..." + std::string upper_px_expr; + std::string upper_py_expr; + std::string upper_pz_expr; + std::string upper_e_expr; + for (size_t i = 0; i < m_upper_vertex_indices.size(); ++i) + { + int idx = m_upper_vertex_indices[i]; + std::string sep = (i == 0) ? "" : " + "; + upper_px_expr += sep + "PxP" + std::to_string(idx); + upper_py_expr += sep + "PyP" + std::to_string(idx); + upper_pz_expr += sep + "PzP" + std::to_string(idx); + upper_e_expr += sep + "EnP" + std::to_string(idx); + } + + TH1D *h_result = nullptr; + try + { + // ---- DATA histogram via RDataFrame ---- + ROOT::RDataFrame df_data(m_data_tree_name, m_data_files); + + auto df = df_data.Define("upper_px", upper_px_expr) + .Define("upper_py", upper_py_expr) + .Define("upper_pz", upper_pz_expr) + .Define("upper_E", upper_e_expr) + .Define("m_value", + [](double E, double px, double py, double pz) + { + double p2 = px * px + py * py + pz * pz; + double m2 = E * E - p2; + if (m2 < 0 && m2 > -1e-12) + m2 = 0.0; + return std::sqrt(std::max(0.0, m2)); + }, + {"upper_E", "upper_px", "upper_py", "upper_pz"}); + + // Determine min and max from data (unweighted) + auto data_min_r = df.Min("m_value"); + auto data_max_r = df.Max("m_value"); + double data_min = *data_min_r; + double data_max = *data_max_r; + + double global_min = data_min; + double global_max = data_max; + + // Build data histogram (if background exists, don't apply weights to data here) + const int n_bins = 200; + ROOT::RDF::RResultPtr h_data; + if (m_background_files_exist) + { + h_data = df.Histo1D({"h_m_data", "M (data)", n_bins, global_min, global_max}, "m_value"); + } + else + { + // if no background files, use weight branch from data (if provided) + if (!weight_branch_name.empty() && weight_branch_name.find('.') == std::string::npos) + { + h_data = df.Histo1D({"h_m_data", "M (data)", n_bins, global_min, global_max}, "m_value", weight_branch_name); + } + else + { + h_data = df.Histo1D({"h_m_data", "M (data)", n_bins, global_min, global_max}, "m_value"); + } + } + + // ---- BACKGROUND histogram via RDataFrame (if present) ---- + if (m_background_files_exist && !m_background_files.empty()) + { + ROOT::RDF::RResultPtr h_bkg; + ROOT::RDataFrame df_bg(m_background_tree_name, m_background_files); + auto dfb = df_bg.Define("upper_px", upper_px_expr) + .Define("upper_py", upper_py_expr) + .Define("upper_pz", upper_pz_expr) + .Define("upper_E", upper_e_expr) + .Define("m_value", + [](double E, double px, double py, double pz) + { + double p2 = px * px + py * py + pz * pz; + double m2 = E * E - p2; + if (m2 < 0 && m2 > -1e-12) + m2 = 0.0; + return std::sqrt(std::max(0.0, m2)); + }, + {"upper_E", "upper_px", "upper_py", "upper_pz"}); + + // Expand global range to include background extremes + auto bg_min_r = dfb.Min("m_value"); + auto bg_max_r = dfb.Max("m_value"); + double bg_min = *bg_min_r; + double bg_max = *bg_max_r; + global_min = std::min(global_min, bg_min); + global_max = std::max(global_max, bg_max); + + // Recreate histograms with unified ranges + h_data = df.Histo1D({"h_m_data", "M (data)", n_bins, global_min, global_max}, "m_value"); + + if (!weight_branch_name.empty() && weight_branch_name.find('.') == std::string::npos) + { + h_bkg = dfb.Histo1D({"h_m_bkg", "M (bg)", n_bins, global_min, global_max}, "m_value", weight_branch_name); + } + else + { + h_bkg = dfb.Histo1D({"h_m_bkg", "M (bg)", n_bins, global_min, global_max}, "m_value"); + } + + // Subtract background from a standalone clone so RResultPtr ownership stays intact. + h_result = (TH1D *)h_data->Clone("h_m_result"); + h_result->SetDirectory(nullptr); + h_result->Add(h_bkg.GetPtr(), -1.0); + } + else + { + // set h_result for consistent return variable + h_result = (TH1D *)h_data->Clone("h_m_result"); + h_result->SetDirectory(nullptr); + } + } + catch (const std::exception &e) + { + report(ERROR, kModule) << "RDataFrame error computing mass: " << e.what() << "\n"; + assert(false); + } + + return h_result; +} + +std::pair RootDataConverter::numberOfEvents(TH1D *hist) +{ + if (!hist) + { + report(ERROR, kModule) << "Null histogram pointer passed to extractNumberOfEvents\n"; + assert(false); + } + + double n_events_err; + double n_events = hist->IntegralAndError(0, hist->GetNbinsX() + 1, n_events_err); + + return {n_events, n_events_err}; +} + +double RootDataConverter::efficiency() +{ + // Count events in genMC and accMC files + // Since MC events are not weighted, we just count them directly + + if (m_genMC_files.empty() || m_accMC_files.empty()) + { + report(WARNING, kModule) << "GenMC or AccMC files not found; cannot compute efficiency\n"; + return 0.0; + } + + try + { + // Count generated events + ROOT::RDataFrame df_gen(m_genMC_tree_name, m_genMC_files); + auto gen_count_r = df_gen.Count(); + double gen_events = *gen_count_r; + + // Count accepted events + ROOT::RDataFrame df_acc(m_accMC_tree_name, m_accMC_files); + auto acc_count_r = df_acc.Count(); + double acc_events = *acc_count_r; + + if (gen_events == 0) + { + report(WARNING, kModule) << "No generated events found\n"; + return 0.0; + } + + report(DEBUG, kModule) << "Efficiency calculation:\n"; + report(DEBUG, kModule) << " Generated events: " << gen_events << "\n"; + report(DEBUG, kModule) << " Accepted events: " << acc_events << "\n"; + + return acc_events / gen_events; + } + catch (const std::exception &e) + { + report(ERROR, kModule) << "RDataFrame error computing efficiency: " << e.what() << "\n"; + return 0.0; + } +} + +std::pair RootDataConverter::acceptanceCorrectedEvents( + double events, + double events_err, + double efficiency) const +{ + if (efficiency <= 0.0 || efficiency > 1.0) + { + report(WARNING, kModule) << "Efficiency value out of bounds (0 < eff <= 1): " << efficiency << "\n"; + return {0.0, 0.0}; + } + + double corrected_events = events / efficiency; + double corrected_events_err = events_err / efficiency; + + return {corrected_events, corrected_events_err}; +} + +void RootDataConverter::validateFiles(const std::vector &files, + const std::string &file_type) const +{ + if (files.empty() && file_type != "background") + { + report(ERROR, kModule) << "No " + << file_type << " files found in fit results for file: " + << m_fit_file << "\n"; + assert(false); + } + else if (files.empty() && file_type == "background" && !m_mute_warnings) + { + report(WARNING, kModule) << "No background files found in fit results for file: " + << m_fit_file + << ". Assuming weights are stored directly in the data" + << " trees.\n"; + } + else + { + return; // files are valid + } +} + +std::pair RootDataConverter::findMinMaxOfBranch(const std::vector &files, + const std::string &tree_name, + const std::string &branch_name) const +{ + double global_min = std::numeric_limits::max(); + double global_max = std::numeric_limits::lowest(); + + for (const std::string &file : files) + { + TFile *f = TFile::Open(file.c_str()); + if (!f || f->IsZombie()) + { + report(ERROR, kModule) << "Error opening ROOT file: " << file << "\n"; + assert(false); + } + TTree *tree = f->Get(tree_name.c_str()); + if (!tree) + { + report(ERROR, kModule) << "Error retrieving tree " << tree_name << " from file " << file << "\n"; + f->Close(); + assert(false); + } + + double local_min = tree->GetMinimum(branch_name.c_str()); + double local_max = tree->GetMaximum(branch_name.c_str()); + + if (local_min < global_min) + global_min = local_min; + if (local_max > global_max) + global_max = local_max; + + f->Close(); + } + + return {global_min, global_max}; +} + +void RootDataConverter::setUpperVertexIndices() +{ + if (m_lower_vertex_indices.empty()) + { + report(ERROR, kModule) << "Lower vertex indices must be set before setting upper vertex indices\n"; + assert(false); + } + + m_upper_vertex_indices.clear(); + const std::vector particle_list = m_cfg_info->reaction(m_fit_results.reactionList()[0])->particleList(); + + // Note that we always skip particle i=0, as this will be the beam photon + for (size_t i = 1; i < particle_list.size(); i++) + { + if (std::find(m_lower_vertex_indices.begin(), m_lower_vertex_indices.end(), i) == m_lower_vertex_indices.end()) + { + m_upper_vertex_indices.push_back(i); + } + } +} \ No newline at end of file diff --git a/src/libraries/UTILITIES/RootDataConverter.h b/src/libraries/UTILITIES/RootDataConverter.h new file mode 100644 index 000000000..0da07423f --- /dev/null +++ b/src/libraries/UTILITIES/RootDataConverter.h @@ -0,0 +1,472 @@ +/** + * @file RootDataConverter.h + * @author Kevin Scheuer + * @brief Class for converting ROOT tree data associated with an AmpTools fit to any format + * @date 2026-05-08 + * + * @note that this currently is built to convert to CSV, but the class can be extended + * to convert to any other format by accessing the various containers that store + * the fit results. + */ + +#ifndef DATA_CONVERTER_H +#define DATA_CONVERTER_H + +#include +#include +#include + +#include "TH1D.h" +#include "TCanvas.h" +#include "TFile.h" +#include "TTree.h" + +#include "IUAmpTools/FitResults.h" +#include "IUAmpTools/ConfigurationInfo.h" + +/** + * @class RootDataConverter + * @brief Converts AmpTools fit result data to any format + * + * This class is designed to take a single AmpTools .fit file, extract the associated + * data files and trees used in the fit, and then extract the relevant data from those + * trees to be converted into any desired output format (e.g. CSV). The extracted data + * includes: + * - Bin edges, centers, averages, and RMS values for t, beam energy, and upper vertex + * mass histograms + * - Total number of events and errors for data and acceptance-corrected data + * - Efficiency calculation based on generated and accepted MC event counts + * + * Example usage in convert_to_csv.cc: + * @code + * std::ofstream csv_file(output_file); + * std::stringstream csv_data; + * bool header_written = false; + * + * for (const auto& file : input_files) { + * RootDataConverter converter(file); + * + * if (!header_written) { + * csv_data << converter.getCSVHeader() << "\n"; + * header_written = true; + * } + * csv_data << converter.getCSVRow() << "\n"; + * } + * + * csv_file << csv_data.str(); + * csv_file.close(); + * @endcode + */ +class RootDataConverter +{ +public: + /** + * @brief Construct a RootDataConverter for a single fit file + * + * This assumes the 4-momenta indices are as follows: + * - index 0 (B): beam photon + * - index 1: single lower vertex particle, typically the recoil proton + * - indices 2 to N: final state particles from upper vertex + * + * @param[in] filename path to the .fit file + * @param[in] mute_warnings whether to mute warnings from FitResults and when + * background files are not found + */ + RootDataConverter(const std::string &filename, bool mute_warnings = false); + + /** + * @brief Construct a new Data Converter object with specified lower vertex indices + * + * The 4-momenta indices are as follows: + * - index 0 (B): beam photon + * - lower_vertex_indices: lower vertex particles, e.g. proton, pi+ that decay from Delta++ + * - All leftover indices: final state particles from the upper vertex + * + * @param[in] filename path to the .fit file + * @param[in] lower_vertex_indices 4-momenta indices for lower vertex particles + * @param[in] mute_warnings whether to mute warnings from FitResults and when + * background files are not found + */ + RootDataConverter(const std::string &filename, + const std::vector &lower_vertex_indices, + bool mute_warnings = false); + + /** + * @brief Construct a new Data Converter object with specified lower vertex and isobar indices + * + * This will add new headers and values for isobar mass and efficiency calculations. + * + * The 4-momenta indices are as follows: + * - index 0 (B): beam photon + * - lower_vertex_indices: lower vertex particles, e.g. proton, pi+ that decay from Delta++ + * - isobar_indices: isobar particles, e.g. pi+ pi- from rho + * - all indices not in lower_vertex_indices or 0: final state particles from upper vertex + * + * @todo: implement. make sure no conflicts with lower vertex indices. Also this + * should add isobar mass and event counts to the output. Usually one is interested + * in the distribution though, rather than just the bin centers/edges like we are + * for the total binned mass. For now then, we will leave this as a future + * improvement until a use case arises that requires it. + * + * @param[in] filename path to the .fit file + * @param[in] lower_vertex_indices 4-momenta indices for lower vertex particles + * @param[in] isobar_indices 4-momenta indices for isobar particles + * @param[in] mute_warnings whether to mute warnings from FitResults and when + * background files are not found + */ + RootDataConverter(const std::string &filename, + const std::vector &lower_vertex_indices, + const std::vector &isobar_indices, + bool mute_warnings = false); + + /** + * @brief Extract fit data from the .fit file + * + * Fills the m_values map with all the relevant information from the fit results and + * associated files that we want to save to csv later. + */ + void extract(); + + /** + * @brief Get the CSV header row + * + * Constructs a comma-separated header string based on the keys of the m_values map. + * + * @note It is crucial that the order of entries in the header matches the order of + * values in getCSVRow(). + * + * @return std::string Comma-separated header string + */ + std::string getCSVHeader() const; + + /** + * @brief Get the CSV row + * + * Constructs a comma-separated row string based on the values of the m_values map. + * + * @note It is crucial that the order of values in this row matches the order of + * entries in getCSVHeader(). + * + * @return std::string Comma-separated row string + */ + std::string getCSVRow() const; + + /** + * @brief Data files associated with the fit results + * @note It assumes one data file per reaction, and that it is the first argument + * passed to the data reader. + * @return std::vector where each element is an absolute path to a data file + */ + std::vector dataFiles() const { return findFiles("data"); }; + + /** + * @brief Background files associated with the fit results, if they exist + * @note It assumes one background file per reaction, and that it is the first + * argument passed to the data reader. + * @return std::vector where each element is an absolute path to a background file + */ + std::vector backgroundFiles() const { return findFiles("background"); }; + + /** + * @brief Generated MC files associated with the fit results + * @note It assumes one genMC file per reaction, and that it is the first argument + * passed to the data reader. + * @return std::vector where each element is an absolute path to a genMC file + */ + std::vector genMCFiles() const { return findFiles("genMC"); }; + + /** + * @brief Accepted MC files associated with the fit results + * @note It assumes one accMC file per reaction, and that it is the first argument + * passed to the data reader. + * @return std::vector where each element is an absolute path to an accMC file + */ + std::vector accMCFiles() const { return findFiles("accMC"); }; + + /** + * @brief Validate that data files were found + */ + + void validateDataFiles() const { validateFiles(m_data_files, "data"); } + /** + * @brief Validate that background files were found + * + * If no background files are found, it will warn the user and assume that weights + * are stored directly in the data tree. + */ + void validateBackgroundFiles() const { validateFiles(m_background_files, "background"); } + + /** + * @brief Validate that generated Monte Carlo files were found + */ + void validateGenMCFiles() const { validateFiles(m_genMC_files, "genMC"); } + + /** + * @brief Validate that accepted Monte Carlo files were found + */ + void validateAccMCFiles() const { validateFiles(m_accMC_files, "accMC"); } + + /** + * @brief data tree name associated with the fit results + * If one tree exists in the data files, that tree name is returned. Otherwise, + * the tree name is extracted based on the arguments given to known data readers. + * If the data reader is unknown, it will warn and default to "kin". + * @return std::string tree name + */ + std::string dataTreeName() const { return findTreeName("data"); }; + + /** + * @brief background tree name associated with the fit results + * If one tree exists in the data files, that tree name is returned. Otherwise, + * the tree name is extracted based on the arguments given to known data readers. + * If the data reader is unknown, it will warn and default to "kin". + * @return std::string tree name + */ + std::string backgroundTreeName() const { return findTreeName("background"); }; + + /** + * @brief generated MC tree name associated with the fit results + * If one tree exists in the data files, that tree name is returned. Otherwise, + * the tree name is extracted based on the arguments given to known data readers. + * If the data reader is unknown, it will warn and default to "kin". + * @return std::string tree name + */ + std::string genMCTreeName() const { return findTreeName("genMC"); }; + + /** + * @brief accepted MC tree name associated with the fit results + * If one tree exists in the data files, that tree name is returned. Otherwise, + * the tree name is extracted based on the arguments given to known data readers. + * If the data reader is unknown, it will warn and default to "kin". + * @return std::string tree name + */ + std::string accMCTreeName() const { return findTreeName("accMC"); }; + + /** + * @brief Get the name of the weight branch in the specified tree and file type + * + * Some data readers, such as FSRootDataReader, can store the event weights in a + * friend tree. This function will check for that case first, and return the + * appropriate weight branch name. If no friend tree is used, it will check for + * "weight" or "Weight" branches in the main tree. If those branches do not exist, + * it will return an empty string to indicate no weight branch found. + * + * @param[in] file_type The type of file ("data", "background") + * @param[in] tree_name The name of the tree within the file + * @return std::string The name of the weight branch, or empty string if none found + * + * @todo FSRoot friend tree case untested + */ + std::string weightBranchName( + const std::string &file_type, + const std::string &tree_name) const; + + /** + * @brief Set indices that belong to the lower vertex final state particles + * + * An AmpTools ROOT tree contains 4-vectors for all particles in the event, labelled + * "EnX", "PxX", "PyX", "PzX" where X is the particle index. The "B" index is always + * the beam photon, but the other indices are not guaranteed to be in any particular + * order. The simplest case is a recoil proton at the lower vertex, which would just + * be index 1. However, there can be multiple particles associated with the lower + * vertex like a Delta++ -> p + pi+ decay. + * + * This function allows the user to specify which indices correspond to the lower + * vertex system. The remaining indices are assumed to be final state particles from + * the upper vertex. This is important for correctly calculating the 4-momentum + * transfer t and the upper vertex mass. + */ + void setLowerVertexIndices(const std::vector &indices) { m_lower_vertex_indices = indices; } + + /** + * @brief Get indices for the lower vertex particles + * + * See setLowerVertexIndices for more details. + */ + std::vector getLowerVertexIndices() const { return m_lower_vertex_indices; } + + /** + * @brief Set indices that belong to the upper vertex final state particles + * + * Any particle indices not in m_lower_vertex_indices that are also not 0 are thus + * assumed to belong to the upper vertex. This saves all leftover indices to + * m_upper_vertex_indices. This is important for correctly calculating the upper + * vertex mass. + * + * @note This function must be called AFTER setLowerVertexIndices + */ + void setUpperVertexIndices(); + + /** + * @brief Get indices for the upper vertex particles + * + * See setUpperVertexIndices for more details. + */ + std::vector getUpperVertexIndices() const { return m_upper_vertex_indices; } + + /** + * @brief Map of header names to their values + */ + std::map getValues() const { return m_values; } + + bool isValid() const { return m_fit_results.valid(); } + + ~RootDataConverter() = default; + +private: + const std::string m_fit_file; ///< Path to the .fit file + const FitResults m_fit_results; ///< AmpTools FitResults object + const ConfigurationInfo *m_cfg_info; ///< ConfigurationInfo from the fit + std::vector m_lower_vertex_indices; ///< Indices of lower vertex particles in the data 4-vectors + std::vector m_upper_vertex_indices; ///< Indices of upper vertex particles in the data 4-vectors + std::vector m_isobar_indices; ///< Indices of isobar particles in the data 4-vectors + bool m_mute_warnings; ///< Whether to mute warnings about missing background files + + const std::vector m_data_files; + const std::vector m_background_files; + const std::vector m_genMC_files; + const std::vector m_accMC_files; + + bool m_background_files_exist; + + const std::string m_data_tree_name; + const std::string m_background_tree_name; + const std::string m_genMC_tree_name; + const std::string m_accMC_tree_name; + + std::map m_values; //< Map of headers to their values for the CSV output (except for the "file" header) + + /** + * @brief Find files of a given type associated with the fit results + * @param[in] file_type data, background, genMC, or accMC + * @note It assumes one file per reaction, and that it is the first argument passed + * to the data reader. + * @return std::vector where each element is an absolute path to a file of the given type + */ + std::vector findFiles(const std::string &file_type) const; + + /** + * @brief Confirm that files vector is non-empty, unless its background + * + * Generally will throw an error if no files are present for a given type. If + * file_type is "background", this can happen, but it warns the user that it assumes + * any event weights are thus stored directly in the data trees. + * + * @param files absolute paths of files associated with the fit results for a given type + * @param file_type data, background, genMC, or accMC + */ + void validateFiles(const std::vector &files, + const std::string &file_type) const; + + /** + * @brief Find the tree name associated with the fit results + * + * This function will first attempt to see if only one tree exists in the ROOT file. + * If multiple trees exist, it will extract the tree name based on the arguments + * given to known data readers in halld_sim. If the data reader is unknown, it will + * warn and default to "kin". + * + * @param[in] file_type data, background, genMC, or accMC + * + * @return std::string tree name + */ + std::string findTreeName(const std::string &file_type) const; + + /** + * @brief Get the names of all trees in a ROOT file + * + * @param filename Path to the ROOT file + * @return std::vector List of tree names in the file + */ + std::vector treesInFile(const std::string &filename) const; + + /** + * @brief Get the histogram of the beam energy distribution + * + * @param[in] weight_branch_name Name of the weight branch (if empty, weights assumed to be 1.0) + * + * @return TH1D* pointer to the beam energy histogram (background subtracted if background files exist) + */ + TH1D *beamEnergyHist(const std::string &weight_branch_name); + + /** + * @brief Get the -t 4-momentum transfer histogram + * + * This is calculated as t = (P_proton - P_recoil)^2, where P_recoil is the sum of + * the 4-vectors of the lower vertex particles (see setLowerVertexIndices) + * + * @param weight_branch_name Name of the weight branch (if empty, weights assumed to be 1.0) + * + * @return TH1D* pointer to the -t histogram (background subtracted if background files exist) + * + * @todo The function currently has not been tested for the FSRootFriendTree scenario, + * and so warns the user and assigns a weight of 1.0 for this case. + */ + TH1D *tHist(const std::string &weight_branch_name); + + /** + * @brief Get the mass histogram for the upper vertex system + * + * Calculated as the invariant mass of the sum of the 4-vectors of the upper vertex + * particles (see setUpperVertexIndices). + * + * @param weight_branch_name Name of the weight branch (if empty, weights assumed to be 1.0) + * + * @return TH1D* pointer to the upper vertex mass histogram (background subtracted if background files exist) + * + * @todo The function currently has not been tested for the FSRootFriendTree scenario, + * and so warns the user and assigns a weight of 1.0 for this case. + */ + TH1D *massHist(const std::string &weight_branch_name); + + /** + * @brief Extract the total number of events and associated error from a given histogram + * + * Use one of the weighted histograms from the beam energy, -t, or mass calculations + * to extract the total number of events and associated error using the + * IntegralAndError method. + * + * @param hist a pointer to a TH1D histogram from which to extract the total number of events and error + * + * @return std::pair where the first element is the total number of + * events and the second element is the associated error + */ + std::pair numberOfEvents(TH1D *hist); + + /** + * @brief Calculate the efficiency based on the number of generated and accepted MC events + * + * It's assumed this is a binned fit, so we are not interested in the shape of the + * acceptance within the bin, just the efficiency value. + * + * @return double accepted / generated number of events + */ + double efficiency(); + + /** + * @brief Calculate the acceptance-corrected number of events and associated error + * + * @param events total number of events. See numberOfEvents. + * @param events_err error on total number of events. See numberOfEvents. + * @param efficiency acceptance efficiency. See efficiency(). + * @return std::pair acceptance-corrected number of events and + * associated error + */ + std::pair acceptanceCorrectedEvents(double events, double events_err, double efficiency) const; + + /** + * @brief Get the max/min values of a branch for a set of files + * + * @param files set of ROOT files containing a common tree and branch name + * @param tree_name ROOT tree containing the branch of interest + * @param branch_name branch name we want to find the max/min values of + * @return std::pair minimum and maximum values of the branch across all files + */ + std::pair findMinMaxOfBranch(const std::vector &files, + const std::string &tree_name, + const std::string &branch_name) const; + + static const char *kModule; +}; + +#endif // DATA_CONVERTER_H \ No newline at end of file diff --git a/src/libraries/UTILITIES/SConscript b/src/libraries/UTILITIES/SConscript index 42c557c7b..c7c73ad06 100644 --- a/src/libraries/UTILITIES/SConscript +++ b/src/libraries/UTILITIES/SConscript @@ -5,8 +5,11 @@ import sbms # get env object and clone it Import('*') -env = env.Clone() +if os.getenv('AMPTOOLS', 'nada')!='nada': -sbms.AddROOT(env) -sbms.AddCCDB(env) -sbms.library(env) + env = env.Clone() + + sbms.AddROOT(env) + sbms.AddAmpTools(env) + sbms.AddCCDB(env) + sbms.library(env) diff --git a/src/programs/AmplitudeAnalysis/SConscript b/src/programs/AmplitudeAnalysis/SConscript index 2e64c9056..d704834d0 100644 --- a/src/programs/AmplitudeAnalysis/SConscript +++ b/src/programs/AmplitudeAnalysis/SConscript @@ -3,7 +3,7 @@ import sbms Import('*') -subdirs = ['fit', 'twopi_plotter', 'twopi_plotter_amp', 'twopi_plotter_mom', 'twopi_plotter_primakoff', 'twolepton_plotter', 'twoleptonGJ_plotter', 'split_mass', 'split_t', 'threepi_plotter_schilling', 'omega_radiative_plotter', 'project_moments', 'plot_etapi_delta', 'project_moments_polarized', 'Bootstrap_plot_etapi_delta_SPDG_allamps_mass_t_bins', 'Pol_moments_viafittedPW', 'project_moments_SPD_etapi0_posepsilon', 'omegapi_plotter', 'vecps_plotter', 'plot_etapi0'] +subdirs = ['fit', 'twopi_plotter', 'twopi_plotter_amp', 'twopi_plotter_mom', 'twopi_plotter_primakoff', 'twolepton_plotter', 'twoleptonGJ_plotter', 'split_mass', 'split_t', 'threepi_plotter_schilling', 'omega_radiative_plotter', 'project_moments', 'plot_etapi_delta', 'project_moments_polarized', 'Bootstrap_plot_etapi_delta_SPDG_allamps_mass_t_bins', 'Pol_moments_viafittedPW', 'project_moments_SPD_etapi0_posepsilon', 'omegapi_plotter', 'vecps_plotter', 'plot_etapi0', 'convert_to_csv'] SConscript(dirs=subdirs, exports='env osname', duplicate=0) diff --git a/src/programs/AmplitudeAnalysis/convert_to_csv/SConscript b/src/programs/AmplitudeAnalysis/convert_to_csv/SConscript new file mode 100644 index 000000000..423109ba1 --- /dev/null +++ b/src/programs/AmplitudeAnalysis/convert_to_csv/SConscript @@ -0,0 +1,20 @@ +import os +import sbms + +# get env object and clone it +Import('*') + +# Verify AMPTOOLS environment variable is set +if os.getenv('AMPTOOLS', 'nada')!='nada': + + env = env.Clone() + + AMPTOOLS_LIBS = "AMPTOOLS_AMPS AMPTOOLS_DATAIO AMPTOOLS_MCGEN UTILITIES" + env.AppendUnique(LIBS = AMPTOOLS_LIBS.split()) + + sbms.AddHDDM(env) + sbms.AddAmpTools(env) + sbms.AddUtilities(env) + sbms.AddROOT(env) + + sbms.executable(env) \ No newline at end of file diff --git a/src/programs/AmplitudeAnalysis/convert_to_csv/convert_to_csv.cc b/src/programs/AmplitudeAnalysis/convert_to_csv/convert_to_csv.cc new file mode 100644 index 000000000..40f02fa94 --- /dev/null +++ b/src/programs/AmplitudeAnalysis/convert_to_csv/convert_to_csv.cc @@ -0,0 +1,513 @@ +/** + * @file convert_to_csv.cc + * @author Kevin Scheuer + * @brief A utility for converting AmpTools .fit files to CSV format + * @date 2026-05-08 + * + * This program converts AmpTools fit results in .fit files to CSV format. It can also + * optionally create separate CSV files for the covariance, correlation, and + * normalization integral matrices, as well as a CSV file containing the data used in + * the fit. The program is designed to be flexible and can handle various amplitude + * naming schemes, which it uses to group amplitudes into coherent sums based on shared + * quantum numbers. The output CSV files can then be easily imported into just about any + * data analysis or plotting software. + * + * Print the help message with + * @verbatim + * user@host:~$ convert_to_csv -h + * @endverbatim + * for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "UTILITIES/FitConverter.h" +#include "UTILITIES/RootDataConverter.h" +#include "IUAmpTools/report.h" + +const char *kModule = "convert_to_csv"; + +// forward declarations +std::vector sort_files(const std::vector &files, int sort_index); +bool are_valid_fit_files(const std::vector &files); + +int main(int argc, char *argv[]) +{ + std::vector args(argv, argv + argc); + + // ==== PARSE ARGUMENTS ==== + // required arguments + std::vector input_files; + std::string output_file = ""; + + // optional arguments + bool is_sorted = false; + int sort_index = -1; + bool is_acceptance_corrected = false; + std::vector lower_vertex_indices; + std::string naming_scheme = "auto"; + bool create_data_file = false; + bool create_covariance = false; + bool create_correlation = false; + bool create_norm_int = false; + bool verbose = false; + bool preview = false; + + auto print_help = []() + { + report(INFO, kModule) << "Usage: convert_to_csv [-h] [-i INPUT_FILES] [-o OUTPUT_PATH] [-s] [--sort-index INDEX] [-a] [-d] [-l LOWER_VERTEX_INDICES] [-n NAMING_SCHEME] [-p] [-v] [--correlation] [--covariance] [--norm-int] \n"; + report(INFO, kModule) << " -i INPUT_FILES:\t\tFull path to the .fit file(s)\n"; + report(INFO, kModule) << " -o OUTPUT_PATH:\t\tFull path to the output .csv file\n"; + report(INFO, kModule) << " -s, --sort:\t\t\tSort files by last number in the file name or path (default:true)\n"; + report(INFO, kModule) << " --sort-index INDEX:\t\tWhat number in file path to sort by (default:-1, so last number in path is used)\n"; + report(INFO, kModule) << " -a --acceptance-correct:\tAcceptance correct the intensities\n"; + report(INFO, kModule) << " -d --data-file:\t\tCreate separate data file\n"; + report(INFO, kModule) << " -l --lower-vertex-indices:\tSpecify indices of lower vertex particles in the data 4-vectors\n"; + report(INFO, kModule) << " -n --naming-scheme:\t\tAmp naming scheme for coherent sum groupings. Default is auto (infers scheme from the amp names)\n"; + report(INFO, kModule) << " -p --preview:\t\t\tPrint files to be processed, but don't run\n"; + report(INFO, kModule) << " -v --verbose:\t\t\tPrint information from converter scripts as they run\n"; + report(INFO, kModule) << " --correlation:\t\tSave correlation matrix to separate csv file\n"; + report(INFO, kModule) << " --covariance:\t\t\tSave covariance matrix to separate csv file\n"; + report(INFO, kModule) << " --norm-int:\t\t\tSave normalization integral matrix to separate csv file\n"; + report(INFO, kModule) << " -h, --help:\t\t\tShow this help message\n"; + }; + + // first check if help message is requested + for (const auto &arg : args) + { + if (arg == "-h" || arg == "--help") + { + print_help(); + return 0; + } + } + + // helper to distinguish flags from negative numbers, since both use "-" character + auto is_flag = [](std::string_view s) + { + return s.starts_with("-") && s.size() > 1 && !std::isdigit(static_cast(s[1])); + }; + + // parse arguments + for (size_t i = 1; i < args.size(); ++i) + { + auto arg = args[i]; + + if (arg == "-i" || arg == "--input") + { + // collect all following arguments until next flag + while (i + 1 < args.size() && !is_flag(args[i + 1])) + { + input_files.emplace_back(args[++i]); + } + } + else if (arg == "-o" || arg == "--output") + { + if (i + 1 < args.size() && !is_flag(args[i + 1])) + { + output_file = std::string(args[++i]); + } + else + { + report(ERROR, kModule) << "Error: -o/--output requires a value. See help message.\n"; + print_help(); + return 1; + } + } + else if (arg == "-s" || arg == "--sort") + { + is_sorted = true; + } + else if (arg == "--sort-index") + { + if (i + 1 < args.size() && !is_flag(args[i + 1])) + { + auto val = args[++i]; + auto [ptr, ec] = std::from_chars(val.data(), val.data() + val.size(), sort_index); + if (ec != std::errc()) + { + report(ERROR, kModule) << "Error: --sort-index requires a valid integer. See help message.\n"; + print_help(); + return 1; + } + } + else + { + report(ERROR, kModule) << "Error: --sort-index requires a value. See help message.\n"; + print_help(); + return 1; + } + } + else if (arg == "-a" || arg == "--acceptance-correct") + { + is_acceptance_corrected = true; + } + else if (arg == "-d" || arg == "--data-file") + { + create_data_file = true; + } + else if (arg == "--covariance") + { + create_covariance = true; + } + else if (arg == "--correlation") + { + create_correlation = true; + } + else if (arg == "--norm-int") + { + create_norm_int = true; + } + else if (arg == "-v" || arg == "--verbose") + { + verbose = true; + } + else if (arg == "-p" || arg == "--preview") + { + preview = true; + } + else if (arg == "-l" || arg == "--lower-vertex-indices") + { + // collect all following arguments until next flag + while (i + 1 < args.size() && !is_flag(args[i + 1])) + { + auto val = args[++i]; + int index; + auto [ptr, ec] = std::from_chars(val.data(), val.data() + val.size(), index); + if (ec != std::errc()) + { + report(ERROR, kModule) << "Error: -l/--lower-vertex-indices requires valid integers. See help message.\n"; + print_help(); + return 1; + } + lower_vertex_indices.push_back(index); + } + } + else if (arg == "-n" || arg == "--naming-scheme") + { + if (i + 1 < args.size() && !is_flag(args[i + 1])) + { + naming_scheme = std::string(args[++i]); + } + else + { + report(ERROR, kModule) << "Error: -n/--naming-scheme requires a value. See help message.\n"; + print_help(); + return 1; + } + } + else if (is_flag(arg)) + { + report(ERROR, kModule) << "Unknown argument: " << arg << "\n"; + print_help(); + return 1; + } + } + + // ==== VALIDATE INPUTS ==== + + // Error checks for required arguments + if (input_files.empty()) + { + report(ERROR, kModule) << "Input files must be provided. See help message." << "\n"; + print_help(); + return 1; + } + if (output_file.empty()) + { + report(ERROR, kModule) << "Output file must be provided. See help message." << "\n"; + print_help(); + return 1; + } + + // ensure only .fit files are provided + if (!are_valid_fit_files(input_files)) + { + report(ERROR, kModule) << "One or more input files are not valid .fit files.\n"; + return 1; + } + + // guarantee that output file ends with .csv + if (std::filesystem::path(output_file).extension() != ".csv") + { + output_file += ".csv"; + } + + // sort files if requested + if (is_sorted) + input_files = sort_files(input_files, sort_index); + + if (preview) + { + report(INFO, kModule) << "Preview mode enabled. Printing files and exiting\n"; + for (const auto &file : input_files) + { + report(INFO, kModule) << "\t" << file << "\n"; + } + return 0; + } + + // ==== PROCESS FILES AND WRITE TO CSV ==== + std::stringstream csv_result_data, csv_cov_data, csv_corr_data, csv_norm_int_data, csv_root_data; + bool header_written = false; + std::string first_file_result_header, first_file_cov_header, first_file_corr_header, first_file_norm_int_header, first_file_data_header; + + for (const auto &file : input_files) + { + if (verbose) + report(INFO, kModule) << "Processing file: " << file << "\n"; + FitConverter converter(file, is_acceptance_corrected, !verbose, naming_scheme); + std::unique_ptr data_converter; + if (create_data_file) + { + if (!lower_vertex_indices.empty()) + { + data_converter = std::make_unique(file, lower_vertex_indices, !verbose); + } + else + { + data_converter = std::make_unique(file, !verbose); + } + } + + if (!header_written) + { + std::string header = converter.getCSVHeader(); + csv_result_data << header << "\n"; + header_written = true; + first_file_result_header = header; + + if (create_covariance) + { + std::string cov_header = converter.getCSVCovarianceMatrixHeader(); + csv_cov_data << cov_header << "\n"; + first_file_cov_header = cov_header; + } + + if (create_correlation) + { + std::string corr_header = converter.getCSVCorrelationMatrixHeader(); + csv_corr_data << corr_header << "\n"; + first_file_corr_header = corr_header; + } + + if (create_norm_int) + { + std::string norm_int_header = converter.getCSVNormIntMatrixHeader(); + csv_norm_int_data << norm_int_header << "\n"; + first_file_norm_int_header = norm_int_header; + } + + if (create_data_file) + { + std::string data_header = data_converter->getCSVHeader(); + csv_root_data << data_header << "\n"; + first_file_data_header = data_header; + } + } + else + { + // this block ensures all files have the same csv header format. If this is + // not the case, then the csv files will be malformed and difficult to work + // with, so we enforce that here before processing any files + std::string header = converter.getCSVHeader(); + if (header != first_file_result_header) + { + report(ERROR, kModule) << "File " + << file + << " has a different CSV header than the first file. Aborting\n"; + return 1; + } + + if (create_covariance) + { + std::string cov_header = converter.getCSVCovarianceMatrixHeader(); + if (cov_header != first_file_cov_header) + { + report(ERROR, kModule) << "File " + << file + << " has a different covariance CSV header than the first file. Aborting\n"; + return 1; + } + } + if (create_correlation) + { + std::string corr_header = converter.getCSVCorrelationMatrixHeader(); + if (corr_header != first_file_corr_header) + { + report(ERROR, kModule) << "File " + << file + << " has a different correlation CSV header than the first file. Aborting\n"; + return 1; + } + } + if (create_norm_int) + { + std::string norm_int_header = converter.getCSVNormIntMatrixHeader(); + if (norm_int_header != first_file_norm_int_header) + { + report(ERROR, kModule) << "File " + << file + << " has a different normalization integral CSV header than the first file. Aborting\n"; + return 1; + } + } + if (create_data_file) + { + std::string data_header = data_converter->getCSVHeader(); + if (data_header != first_file_data_header) + { + report(ERROR, kModule) << "File " + << file + << " has a different data CSV header than the first file. Aborting\n"; + return 1; + } + } + } + csv_result_data << converter.getCSVRow() << "\n"; + + if (create_covariance) + csv_cov_data << converter.getCSVCovarianceMatrix() << "\n"; + if (create_correlation) + csv_corr_data << converter.getCSVCorrelationMatrix() << "\n"; + if (create_norm_int) + csv_norm_int_data << converter.getCSVNormIntMatrix() << "\n"; + if (create_data_file) + csv_root_data << data_converter->getCSVRow() << "\n"; + + // loop to next input file to make a new row + } + + // write the csv data to the output file + std::ofstream result_file(output_file); + result_file << csv_result_data.str(); + result_file.close(); + + // covariance and correlation matrix files are written to separate files in the same + // directory as the main output file, with suffixes _covariance.csv and + // _correlation.csv + if (create_covariance) + { + std::filesystem::path output_path(output_file); + std::string covariance_file = (output_path.parent_path() / (output_path.stem().string() + "_covariance.csv")).string(); + std::ofstream cov_file(covariance_file); + cov_file << csv_cov_data.str(); + cov_file.close(); + } + if (create_correlation) + { + std::filesystem::path output_path(output_file); + std::string correlation_file = (output_path.parent_path() / (output_path.stem().string() + "_correlation.csv")).string(); + std::ofstream corr_file(correlation_file); + corr_file << csv_corr_data.str(); + corr_file.close(); + } + if (create_norm_int) + { + std::filesystem::path output_path(output_file); + std::string norm_int_file = (output_path.parent_path() / (output_path.stem().string() + "_norm_int.csv")).string(); + std::ofstream norm_int_out_file(norm_int_file); + norm_int_out_file << csv_norm_int_data.str(); + norm_int_out_file.close(); + } + if (create_data_file) + { + std::filesystem::path output_path(output_file); + std::string data_file = (output_path.parent_path() / (output_path.stem().string() + "_data.csv")).string(); + std::ofstream data_out_file(data_file); + data_out_file << csv_root_data.str(); + data_out_file.close(); + } + + return 0; +} + +/** + * @brief Sort files based on a number extracted from the file path + * + * Extracts all numbers (including decimals) from each file path and sorts based on + * the number at the specified index position. + * + * @param[in] files Vector of file paths to sort + * @param[in] sort_index Index of the number to sort by (supports negative indexing). + * Default -1 means use the last number found in the path. + * @return std::vector Sorted vector of file paths + */ +std::vector sort_files(const std::vector &files, int sort_index) +{ + auto extract_number = [sort_index](const std::string &path) -> double + { + // Regex pattern to match integers and decimal numbers + std::regex number_pattern(R"(\d*\.?\d+)"); + std::vector numbers; + + // Find all numbers in the path + auto begin = std::sregex_iterator(path.begin(), path.end(), number_pattern); + auto end = std::sregex_iterator(); + + for (auto it = begin; it != end; ++it) + { + numbers.push_back(std::stod(it->str())); + } + + if (numbers.empty()) + { + return std::numeric_limits::infinity(); + } + + int index = sort_index; + + // handle negative indexing + if (index < 0) + { + index = static_cast(numbers.size()) + index; + } + + // Return the number at the specified index, or infinity if out of bounds + if (index >= 0 && index < static_cast(numbers.size())) + { + return numbers[index]; + } + + return std::numeric_limits::infinity(); + }; + + // Create a copy and sort it + std::vector sorted_files = files; + std::sort(sorted_files.begin(), sorted_files.end(), + [&extract_number](const std::string &a, const std::string &b) + { + return extract_number(a) < extract_number(b); + }); + + return sorted_files; +} + +/** + * @brief Check if all provided files are .fit files + * + * @param[in] files Vector of file paths to check + * @return true if all files have a .fit extension and exist, false otherwise + * @return false if any file is not a .fit file or does not exist + */ +bool are_valid_fit_files(const std::vector &files) +{ + for (const auto &file : files) + { + if (std::filesystem::path(file).extension() != ".fit" || !std::filesystem::exists(file)) + { + return false; + } + } + return true; +} \ No newline at end of file