From a873d03339a2c032a469e1ceb1c4b0d539fe3818 Mon Sep 17 00:00:00 2001 From: Qazi-pk Date: Wed, 17 Jun 2026 16:39:25 +0500 Subject: [PATCH 1/9] Add install script for physics engine dependency --- algorithms/PIR/install.sh | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 algorithms/PIR/install.sh diff --git a/algorithms/PIR/install.sh b/algorithms/PIR/install.sh new file mode 100644 index 00000000..8b21597b --- /dev/null +++ b/algorithms/PIR/install.sh @@ -0,0 +1,2 @@ +#!/bin/bash +pip install "git+https://github.com/Qazi-pk/physics-engine.git@v3.4" From f81860b351ad497834e94e73d651b54e53459fe6 Mon Sep 17 00:00:00 2001 From: Qazi-pk Date: Wed, 17 Jun 2026 16:48:26 +0500 Subject: [PATCH 2/9] Add metadata for PIR project Added metadata for the Physics Intermediate Representation (PIR) project, including authors, paper title, and description. --- algorithms/PIR/metadata.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 algorithms/PIR/metadata.yml diff --git a/algorithms/PIR/metadata.yml b/algorithms/PIR/metadata.yml new file mode 100644 index 00000000..89a46a60 --- /dev/null +++ b/algorithms/PIR/metadata.yml @@ -0,0 +1,12 @@ +# PIR — Physics Intermediate Representation +authors: + - name: Qazi Hanif + email: qmhanif70@gmail.com +key: PIR +paper: + title: "PIR: Physics Intermediate Representation for Automated Discovery of Physical Laws" + url: https://doi.org/10.5281/zenodo.19723561 +description: > + Classical symbolic regression via monomial-basis search with log-linearization + gate for power-law detection, pairwise structure decomposition, RANSAC, + sparse regression, and iterative residual refinement. No neural components. 
From 25f12b7a5338b52998c4cf60ad3010306ddfca0e Mon Sep 17 00:00:00 2001 From: Qazi-pk Date: Wed, 17 Jun 2026 16:50:52 +0500 Subject: [PATCH 3/9] Add POT to requirements.txt --- algorithms/PIR/requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 algorithms/PIR/requirements.txt diff --git a/algorithms/PIR/requirements.txt b/algorithms/PIR/requirements.txt new file mode 100644 index 00000000..99f773a0 --- /dev/null +++ b/algorithms/PIR/requirements.txt @@ -0,0 +1 @@ +POT From 5d2fc167742f2ecbb7b8129c0ee9ca9ec1d944e2 Mon Sep 17 00:00:00 2001 From: Qazi-pk Date: Wed, 17 Jun 2026 16:53:48 +0500 Subject: [PATCH 4/9] Add PIRClassicRegressor and model interface Implement PIRClassicRegressor and model function for SRBench. --- algorithms/PIR/regressor.py | 199 ++++++++++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 algorithms/PIR/regressor.py diff --git a/algorithms/PIR/regressor.py b/algorithms/PIR/regressor.py new file mode 100644 index 00000000..7e6113c5 --- /dev/null +++ b/algorithms/PIR/regressor.py @@ -0,0 +1,199 @@ +""" +SRBench submission: PIR (Physics Intermediate Representation) -- classical engine. + +This file is GLUE only. It wraps the public, pip-installable classical PIR engine +(installed via install.sh from a stable source repo) in the interface SRBench +expects: + + est a scikit-learn-compatible Regressor instance + model(est, X=None) returns a sympy-parseable string for the fitted model + eval_kwargs method-specific args forwarded to evaluate_model.py + +No JEPA, no flow prior, no OT loss. Vanilla classical PIR only -- this is the +configuration that produced the archived blind Tier A baseline (13/44 stable, +~29.5% mean over 5 seeds, results_tierA_blind/). 
+ +============================================================================= +THREE THINGS YOU MUST CONFIRM BEFORE OPENING THE PR (do not skip these) +----------------------------------------------------------------------------- +[1] PUBLIC ENGINE URL. SRBench CI cannot pull a private repo. Set the repo + URL + ref in install.sh. The import below must resolve from that public + package. + +[2] CONFIG MUST MATCH THE ARCHIVED BASELINE. The kwargs in _PIR_VANILLA_CONFIG + below must be byte-for-byte the configuration used by sweep_tierA_blind.py + that produced results_tierA_blind/. Otherwise the submitted number won't be + the number you archived -- an integrity gap. Open sweep_tierA_blind.py and + copy the exact PIRRegressor(...) kwargs here. The values below are + placeholders based on the handoff (use_ot_loss=False, max_train_rows=800) + and are NOT yet verified against the sweep script. + +[3] VARIABLE NAMES. model() must return a string whose symbols match the + SRBench dataset's column names (X.columns), NOT your local feynman_loader + names. SRBench/PMLB Feynman columns may differ from ~/feynman_data/. Run the + smoke test in SUBMISSION_NOTES.md against one real SRBench dataset and + confirm the returned symbols match before trusting CI. +============================================================================= +""" + +import signal +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, RegressorMixin + +# The classical engine. install.sh pip-installs this from the PUBLIC repo. +# Path per handoff: physics_engine/sklearn_adapter.py defines PIRRegressor. +from physics_engine.sklearn_adapter import PIRRegressor + + +# --- [2] EXACT vanilla config that produced results_tierA_blind/ ------------- +# CONFIRM these against sweep_tierA_blind.py before submitting. 
Anything that +# changes the discovered expression (filtering flags, search depth, subsample, +# seeds handling) belongs here so the submission reproduces the archived 13/44. +_PIR_VANILLA_CONFIG = dict( + enforce_dimensions=False, # blind sweep ran with dim-filter OFF + allowed_powers=[1, 2], # powers 1,2 only (the structural cap) + include_pairwise_products=True, # pairwise on; no 3-var assembly + use_ransac=True, + use_residual=True, + use_sparse=True, + use_ot_loss=False, + add_physics_features=False, + # random_state passed via the wrapper (defaults to 0, matching the + # archived blind sweep which used SEED = 0). All other params + # (alpha, beta, max_basis_terms, lambda_penalty, max_train_rows) are + # left at adapter defaults -- the blind sweep did not set them. +) + + +class _Timeout(Exception): + pass + + +def _on_alarm(signum, frame): + raise _Timeout() + + +class PIRClassicRegressor(BaseEstimator, RegressorMixin): + """Thin sklearn wrapper around classical PIRRegressor. + + Adds: a `max_time` budget enforced via SIGALRM (required by SRBench), a + `random_state` attribute (required by SRBench), and a guaranteed-valid + fallback model so model() never raises even if fit is interrupted. + """ + + def __init__(self, max_time=3600, random_state=0, **pir_kwargs): + self.max_time = max_time + self.random_state = random_state + # Merge pinned vanilla config with any overrides passed by the harness. + self.pir_kwargs = {**_PIR_VANILLA_CONFIG, **pir_kwargs} + + def _build(self): + kw = dict(self.pir_kwargs) + # Pass random_state through only if the engine accepts it; harmless + # to set as attribute either way. + try: + return PIRRegressor(random_state=self.random_state, **kw) + except TypeError: + return PIRRegressor(**kw) + + def fit(self, X, y): + # Guarantee a valid model exists before we risk a timeout: constant + # = mean(y). evaluate_model.py can always score this. 
+ y_arr = np.asarray(y, dtype=float).ravel() + self._fallback_expr_ = repr(float(np.mean(y_arr))) if y_arr.size else "0.0" + self.expr_ = self._fallback_expr_ + self._inner = self._build() + + use_alarm = hasattr(signal, "SIGALRM") and self.max_time and self.max_time > 0 + old_handler = None + if use_alarm: + old_handler = signal.signal(signal.SIGALRM, _on_alarm) + signal.alarm(int(self.max_time)) + try: + self._inner.fit(X, y) + # Adapter exposes model() -> str(self.expr_); fall back to .expr_. + if hasattr(self._inner, "model"): + self.expr_ = self._inner.model() + elif hasattr(self._inner, "expr_"): + self.expr_ = str(self._inner.expr_) + except _Timeout: + # Keep the constant fallback already stored in self.expr_. + pass + finally: + if use_alarm: + signal.alarm(0) + if old_handler is not None: + signal.signal(signal.SIGALRM, old_handler) + self.is_fitted_ = True + return self + + def predict(self, X): + if hasattr(self, "_inner") and hasattr(self._inner, "predict"): + try: + return self._inner.predict(X) + except Exception: + pass + # Fallback: constant prediction (matches the fallback expr). + n = X.shape[0] if hasattr(X, "shape") else len(X) + try: + val = float(self._fallback_expr_) + except (TypeError, ValueError): + val = 0.0 + return np.full(n, val, dtype=float) + + def model(self): + return self.expr_ + + +# The estimator SRBench will fit. max_time is the param SRBench controls; +# the harness also sends SIGALRM at the process level if fit() overruns. +est = PIRClassicRegressor(max_time=3600, random_state=None) + + +def model(est, X=None): + """Return a sympy-parseable model string with symbols matching X.columns. + + Uses the SRBench-documented mapping idiom only if the engine emitted + generic names (x_0, x0, X0, ...). If the engine already uses the dataset + column names (as the handoff example '1.0*Ef*q2' suggests), the string is + returned unchanged. 
+ """ + expr = est.model() if hasattr(est, "model") else str(getattr(est, "expr_", "0.0")) + + if X is None or not hasattr(X, "columns"): + return expr + + cols = list(X.columns) + # If any real column name already appears, assume names are correct. + if any(str(c) in expr for c in cols): + return expr + + # Otherwise remap generic positional names -> dataset columns. + # reversed() so 'x_1' doesn't clobber the prefix of 'x_10'. + import re + for prefix in ("x_", "x", "X_", "X"): + if re.search(rf"\b{prefix}\d+\b", expr): + mapping = {f"{prefix}{i}": str(k) for i, k in enumerate(cols)} + for k, v in reversed(list(mapping.items())): + expr = re.sub(rf"\b{re.escape(k)}\b", v, expr) + break + return expr + + +# --- forwarded to evaluate_model.py ------------------------------------------ +# CRITICAL: scale_x/scale_y MUST be False. SRBench StandardScales X and y by +# default, which destroys the units and exact coefficients PIR depends on +# (a scaled run recovers ~nothing). The dev harness already forces these off on +# the symbolic-data track, but we pin them so a non-sym run can't quietly zero +# us out. skip_tuning=True: PIR has no GridSearch tuning step to run. +# +# NOTE: these keys target the `dev` branch signature (where PRs go): +# evaluate_model(..., scale_x, scale_y, pre_train, skip_tuning, sym_data) +# If you test against `master` instead, it uses `test_params` rather than +# `skip_tuning`; see SUBMISSION_NOTES.md. 
+eval_kwargs = { + "scale_x": False, + "scale_y": False, + "skip_tuning": True, +} From 460ed0e9be3a1434f18368e6e73147e13a358c4b Mon Sep 17 00:00:00 2001 From: Qazi-pk Date: Thu, 18 Jun 2026 11:51:42 +0500 Subject: [PATCH 5/9] Delete algorithms/PIR directory --- algorithms/PIR/install.sh | 2 - algorithms/PIR/metadata.yml | 12 -- algorithms/PIR/regressor.py | 199 -------------------------------- algorithms/PIR/requirements.txt | 1 - 4 files changed, 214 deletions(-) delete mode 100644 algorithms/PIR/install.sh delete mode 100644 algorithms/PIR/metadata.yml delete mode 100644 algorithms/PIR/regressor.py delete mode 100644 algorithms/PIR/requirements.txt diff --git a/algorithms/PIR/install.sh b/algorithms/PIR/install.sh deleted file mode 100644 index 8b21597b..00000000 --- a/algorithms/PIR/install.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -pip install "git+https://github.com/Qazi-pk/physics-engine.git@v3.4" diff --git a/algorithms/PIR/metadata.yml b/algorithms/PIR/metadata.yml deleted file mode 100644 index 89a46a60..00000000 --- a/algorithms/PIR/metadata.yml +++ /dev/null @@ -1,12 +0,0 @@ -# PIR — Physics Intermediate Representation -authors: - - name: Qazi Hanif - email: qmhanif70@gmail.com -key: PIR -paper: - title: "PIR: Physics Intermediate Representation for Automated Discovery of Physical Laws" - url: https://doi.org/10.5281/zenodo.19723561 -description: > - Classical symbolic regression via monomial-basis search with log-linearization - gate for power-law detection, pairwise structure decomposition, RANSAC, - sparse regression, and iterative residual refinement. No neural components. diff --git a/algorithms/PIR/regressor.py b/algorithms/PIR/regressor.py deleted file mode 100644 index 7e6113c5..00000000 --- a/algorithms/PIR/regressor.py +++ /dev/null @@ -1,199 +0,0 @@ -""" -SRBench submission: PIR (Physics Intermediate Representation) -- classical engine. - -This file is GLUE only. 
It wraps the public, pip-installable classical PIR engine -(installed via install.sh from a stable source repo) in the interface SRBench -expects: - - est a scikit-learn-compatible Regressor instance - model(est, X=None) returns a sympy-parseable string for the fitted model - eval_kwargs method-specific args forwarded to evaluate_model.py - -No JEPA, no flow prior, no OT loss. Vanilla classical PIR only -- this is the -configuration that produced the archived blind Tier A baseline (13/44 stable, -~29.5% mean over 5 seeds, results_tierA_blind/). - -============================================================================= -THREE THINGS YOU MUST CONFIRM BEFORE OPENING THE PR (do not skip these) ------------------------------------------------------------------------------ -[1] PUBLIC ENGINE URL. SRBench CI cannot pull a private repo. Set the repo - URL + ref in install.sh. The import below must resolve from that public - package. - -[2] CONFIG MUST MATCH THE ARCHIVED BASELINE. The kwargs in _PIR_VANILLA_CONFIG - below must be byte-for-byte the configuration used by sweep_tierA_blind.py - that produced results_tierA_blind/. Otherwise the submitted number won't be - the number you archived -- an integrity gap. Open sweep_tierA_blind.py and - copy the exact PIRRegressor(...) kwargs here. The values below are - placeholders based on the handoff (use_ot_loss=False, max_train_rows=800) - and are NOT yet verified against the sweep script. - -[3] VARIABLE NAMES. model() must return a string whose symbols match the - SRBench dataset's column names (X.columns), NOT your local feynman_loader - names. SRBench/PMLB Feynman columns may differ from ~/feynman_data/. Run the - smoke test in SUBMISSION_NOTES.md against one real SRBench dataset and - confirm the returned symbols match before trusting CI. 
-============================================================================= -""" - -import signal -import numpy as np -import pandas as pd -from sklearn.base import BaseEstimator, RegressorMixin - -# The classical engine. install.sh pip-installs this from the PUBLIC repo. -# Path per handoff: physics_engine/sklearn_adapter.py defines PIRRegressor. -from physics_engine.sklearn_adapter import PIRRegressor - - -# --- [2] EXACT vanilla config that produced results_tierA_blind/ ------------- -# CONFIRM these against sweep_tierA_blind.py before submitting. Anything that -# changes the discovered expression (filtering flags, search depth, subsample, -# seeds handling) belongs here so the submission reproduces the archived 13/44. -_PIR_VANILLA_CONFIG = dict( - enforce_dimensions=False, # blind sweep ran with dim-filter OFF - allowed_powers=[1, 2], # powers 1,2 only (the structural cap) - include_pairwise_products=True, # pairwise on; no 3-var assembly - use_ransac=True, - use_residual=True, - use_sparse=True, - use_ot_loss=False, - add_physics_features=False, - # random_state passed via the wrapper (defaults to 0, matching the - # archived blind sweep which used SEED = 0). All other params - # (alpha, beta, max_basis_terms, lambda_penalty, max_train_rows) are - # left at adapter defaults -- the blind sweep did not set them. -) - - -class _Timeout(Exception): - pass - - -def _on_alarm(signum, frame): - raise _Timeout() - - -class PIRClassicRegressor(BaseEstimator, RegressorMixin): - """Thin sklearn wrapper around classical PIRRegressor. - - Adds: a `max_time` budget enforced via SIGALRM (required by SRBench), a - `random_state` attribute (required by SRBench), and a guaranteed-valid - fallback model so model() never raises even if fit is interrupted. - """ - - def __init__(self, max_time=3600, random_state=0, **pir_kwargs): - self.max_time = max_time - self.random_state = random_state - # Merge pinned vanilla config with any overrides passed by the harness. 
- self.pir_kwargs = {**_PIR_VANILLA_CONFIG, **pir_kwargs} - - def _build(self): - kw = dict(self.pir_kwargs) - # Pass random_state through only if the engine accepts it; harmless - # to set as attribute either way. - try: - return PIRRegressor(random_state=self.random_state, **kw) - except TypeError: - return PIRRegressor(**kw) - - def fit(self, X, y): - # Guarantee a valid model exists before we risk a timeout: constant - # = mean(y). evaluate_model.py can always score this. - y_arr = np.asarray(y, dtype=float).ravel() - self._fallback_expr_ = repr(float(np.mean(y_arr))) if y_arr.size else "0.0" - self.expr_ = self._fallback_expr_ - self._inner = self._build() - - use_alarm = hasattr(signal, "SIGALRM") and self.max_time and self.max_time > 0 - old_handler = None - if use_alarm: - old_handler = signal.signal(signal.SIGALRM, _on_alarm) - signal.alarm(int(self.max_time)) - try: - self._inner.fit(X, y) - # Adapter exposes model() -> str(self.expr_); fall back to .expr_. - if hasattr(self._inner, "model"): - self.expr_ = self._inner.model() - elif hasattr(self._inner, "expr_"): - self.expr_ = str(self._inner.expr_) - except _Timeout: - # Keep the constant fallback already stored in self.expr_. - pass - finally: - if use_alarm: - signal.alarm(0) - if old_handler is not None: - signal.signal(signal.SIGALRM, old_handler) - self.is_fitted_ = True - return self - - def predict(self, X): - if hasattr(self, "_inner") and hasattr(self._inner, "predict"): - try: - return self._inner.predict(X) - except Exception: - pass - # Fallback: constant prediction (matches the fallback expr). - n = X.shape[0] if hasattr(X, "shape") else len(X) - try: - val = float(self._fallback_expr_) - except (TypeError, ValueError): - val = 0.0 - return np.full(n, val, dtype=float) - - def model(self): - return self.expr_ - - -# The estimator SRBench will fit. max_time is the param SRBench controls; -# the harness also sends SIGALRM at the process level if fit() overruns. 
-est = PIRClassicRegressor(max_time=3600, random_state=None) - - -def model(est, X=None): - """Return a sympy-parseable model string with symbols matching X.columns. - - Uses the SRBench-documented mapping idiom only if the engine emitted - generic names (x_0, x0, X0, ...). If the engine already uses the dataset - column names (as the handoff example '1.0*Ef*q2' suggests), the string is - returned unchanged. - """ - expr = est.model() if hasattr(est, "model") else str(getattr(est, "expr_", "0.0")) - - if X is None or not hasattr(X, "columns"): - return expr - - cols = list(X.columns) - # If any real column name already appears, assume names are correct. - if any(str(c) in expr for c in cols): - return expr - - # Otherwise remap generic positional names -> dataset columns. - # reversed() so 'x_1' doesn't clobber the prefix of 'x_10'. - import re - for prefix in ("x_", "x", "X_", "X"): - if re.search(rf"\b{prefix}\d+\b", expr): - mapping = {f"{prefix}{i}": str(k) for i, k in enumerate(cols)} - for k, v in reversed(list(mapping.items())): - expr = re.sub(rf"\b{re.escape(k)}\b", v, expr) - break - return expr - - -# --- forwarded to evaluate_model.py ------------------------------------------ -# CRITICAL: scale_x/scale_y MUST be False. SRBench StandardScales X and y by -# default, which destroys the units and exact coefficients PIR depends on -# (a scaled run recovers ~nothing). The dev harness already forces these off on -# the symbolic-data track, but we pin them so a non-sym run can't quietly zero -# us out. skip_tuning=True: PIR has no GridSearch tuning step to run. -# -# NOTE: these keys target the `dev` branch signature (where PRs go): -# evaluate_model(..., scale_x, scale_y, pre_train, skip_tuning, sym_data) -# If you test against `master` instead, it uses `test_params` rather than -# `skip_tuning`; see SUBMISSION_NOTES.md. 
-eval_kwargs = { - "scale_x": False, - "scale_y": False, - "skip_tuning": True, -} diff --git a/algorithms/PIR/requirements.txt b/algorithms/PIR/requirements.txt deleted file mode 100644 index 99f773a0..00000000 --- a/algorithms/PIR/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -POT From 11fc617593b83a0c58a18844f0e6e25fedbe365c Mon Sep 17 00:00:00 2001 From: Qazi-pk Date: Thu, 18 Jun 2026 12:41:01 +0500 Subject: [PATCH 6/9] Add install script for physics engine dependency --- algorithms/pir/install.sh | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 algorithms/pir/install.sh diff --git a/algorithms/pir/install.sh b/algorithms/pir/install.sh new file mode 100644 index 00000000..8b21597b --- /dev/null +++ b/algorithms/pir/install.sh @@ -0,0 +1,2 @@ +#!/bin/bash +pip install "git+https://github.com/Qazi-pk/physics-engine.git@v3.4" From ad352322047bc8e9aaea2f5fe287a874991212c3 Mon Sep 17 00:00:00 2001 From: Qazi-pk Date: Thu, 18 Jun 2026 12:43:40 +0500 Subject: [PATCH 7/9] Add metadata for Physics Intermediate Representation Added metadata for the Physics Intermediate Representation (PIR) including authors, paper title, and description. --- algorithms/pir/metadata.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 algorithms/pir/metadata.yml diff --git a/algorithms/pir/metadata.yml b/algorithms/pir/metadata.yml new file mode 100644 index 00000000..89a46a60 --- /dev/null +++ b/algorithms/pir/metadata.yml @@ -0,0 +1,12 @@ +# PIR — Physics Intermediate Representation +authors: + - name: Qazi Hanif + email: qmhanif70@gmail.com +key: PIR +paper: + title: "PIR: Physics Intermediate Representation for Automated Discovery of Physical Laws" + url: https://doi.org/10.5281/zenodo.19723561 +description: > + Classical symbolic regression via monomial-basis search with log-linearization + gate for power-law detection, pairwise structure decomposition, RANSAC, + sparse regression, and iterative residual refinement. No neural components. 
From 45ef46bc7f8ac2bfccf104975d132a995518462b Mon Sep 17 00:00:00 2001
From: Qazi-pk
Date: Thu, 18 Jun 2026 12:45:17 +0500
Subject: [PATCH 8/9] Add POT to requirements.txt

---
 algorithms/pir/requirements.txt | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 algorithms/pir/requirements.txt

diff --git a/algorithms/pir/requirements.txt b/algorithms/pir/requirements.txt
new file mode 100644
index 00000000..99f773a0
--- /dev/null
+++ b/algorithms/pir/requirements.txt
@@ -0,0 +1 @@
+POT

From 0c83abca501241355fd5c125074dc3c61ee637a3 Mon Sep 17 00:00:00 2001
From: Qazi-pk
Date: Thu, 18 Jun 2026 12:47:47 +0500
Subject: [PATCH 9/9] Add PIRClassicRegressor for SRBench integration

This file implements a classical PIR regressor for SRBench, including
configuration and model handling.
---
 algorithms/pir/regressor.py | 301 ++++++++++++++++++++++++++++++++++++
 1 file changed, 301 insertions(+)
 create mode 100644 algorithms/pir/regressor.py

diff --git a/algorithms/pir/regressor.py b/algorithms/pir/regressor.py
new file mode 100644
index 00000000..7e6113c5
--- /dev/null
+++ b/algorithms/pir/regressor.py
@@ -0,0 +1,301 @@
+"""
+SRBench submission: PIR (Physics Intermediate Representation) -- classical engine.
+
+This file is GLUE only. It wraps the public, pip-installable classical PIR engine
+(installed via install.sh from a stable source repo) in the interface SRBench
+expects:
+
+    est                  a scikit-learn-compatible Regressor instance
+    model(est, X=None)   returns a sympy-parseable string for the fitted model
+    eval_kwargs          method-specific args forwarded to evaluate_model.py
+
+No JEPA, no flow prior, no OT loss. Vanilla classical PIR only -- this is the
+configuration that produced the archived blind Tier A baseline (13/44 stable,
+~29.5% mean over 5 seeds, results_tierA_blind/).
+
+=============================================================================
+THREE THINGS YOU MUST CONFIRM BEFORE OPENING THE PR (do not skip these)
+-----------------------------------------------------------------------------
+[1] PUBLIC ENGINE URL. SRBench CI cannot pull a private repo. Set the repo
+    URL + ref in install.sh. The import below must resolve from that public
+    package.
+
+[2] CONFIG MUST MATCH THE ARCHIVED BASELINE. The kwargs in _PIR_VANILLA_CONFIG
+    below must be byte-for-byte the configuration used by sweep_tierA_blind.py
+    that produced results_tierA_blind/. Otherwise the submitted number won't be
+    the number you archived -- an integrity gap. Open sweep_tierA_blind.py and
+    copy the exact PIRRegressor(...) kwargs here. The values below are
+    placeholders based on the handoff (use_ot_loss=False, max_train_rows=800)
+    and are NOT yet verified against the sweep script.
+
+[3] VARIABLE NAMES. model() must return a string whose symbols match the
+    SRBench dataset's column names (X.columns), NOT your local feynman_loader
+    names. SRBench/PMLB Feynman columns may differ from ~/feynman_data/. Run the
+    smoke test in SUBMISSION_NOTES.md against one real SRBench dataset and
+    confirm the returned symbols match before trusting CI.
+=============================================================================
+"""
+
+import copy
+import re
+import signal
+import threading
+import time
+import warnings
+
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator, RegressorMixin
+
+# The classical engine. install.sh pip-installs this from the PUBLIC repo.
+# Path per handoff: physics_engine/sklearn_adapter.py defines PIRRegressor.
+from physics_engine.sklearn_adapter import PIRRegressor
+
+
+# --- [2] EXACT vanilla config that produced results_tierA_blind/ -------------
+# CONFIRM these against sweep_tierA_blind.py before submitting. Anything that
+# changes the discovered expression (filtering flags, search depth, subsample,
+# seeds handling) belongs here so the submission reproduces the archived 13/44.
+_PIR_VANILLA_CONFIG = dict(
+    enforce_dimensions=False,        # blind sweep ran with dim-filter OFF
+    allowed_powers=[1, 2],           # powers 1,2 only (the structural cap)
+    include_pairwise_products=True,  # pairwise on; no 3-var assembly
+    use_ransac=True,
+    use_residual=True,
+    use_sparse=True,
+    use_ot_loss=False,
+    add_physics_features=False,
+    # random_state passed via the wrapper (defaults to 0, matching the
+    # archived blind sweep which used SEED = 0). All other params
+    # (alpha, beta, max_basis_terms, lambda_penalty, max_train_rows) are
+    # left at adapter defaults -- the blind sweep did not set them.
+)
+
+# Constructor arguments owned by the wrapper itself; every other keyword is
+# forwarded to the engine.
+_WRAPPER_PARAMS = ("max_time", "random_state")
+
+# Generic positional-name styles that model() can remap, tried in this order.
+_GENERIC_PREFIXES = ("x_", "x", "X_", "X")
+
+
+class _Timeout(Exception):
+    """Raised by the SIGALRM handler when the fit() time budget expires."""
+
+
+def _on_alarm(signum, frame):
+    """SIGALRM handler: abort the running fit by raising _Timeout."""
+    raise _Timeout()
+
+
+class PIRClassicRegressor(BaseEstimator, RegressorMixin):
+    """Thin sklearn wrapper around classical PIRRegressor.
+
+    Adds: a `max_time` budget enforced via SIGALRM (required by SRBench), a
+    `random_state` attribute (required by SRBench), and a guaranteed-valid
+    fallback model so model() never raises even if fit is interrupted.
+
+    Parameters
+    ----------
+    max_time : int or float, default=3600
+        Wall-clock budget for fit() in seconds. None or a value <= 0 disables
+        the budget.
+    random_state : int or None, default=0
+        Seed forwarded to the engine when its constructor accepts one.
+    **pir_kwargs
+        Engine keyword overrides, merged on top of _PIR_VANILLA_CONFIG.
+    """
+
+    def __init__(self, max_time=3600, random_state=0, **pir_kwargs):
+        self.max_time = max_time
+        self.random_state = random_state
+        # Merge pinned vanilla config with any overrides passed by the harness.
+        # The pinned values are deep-copied so that instances never share the
+        # mutable allowed_powers list.
+        self.pir_kwargs = {**copy.deepcopy(_PIR_VANILLA_CONFIG), **pir_kwargs}
+
+    def get_params(self, deep=True):
+        """Return wrapper and engine parameters as one flat dict.
+
+        BaseEstimator only reports the named __init__ arguments, so without
+        this override sklearn.base.clone() would silently drop **pir_kwargs.
+        """
+        params = {name: getattr(self, name) for name in _WRAPPER_PARAMS}
+        params.update(self.pir_kwargs)
+        return params
+
+    def set_params(self, **params):
+        """Set wrapper parameters directly and route the rest to the engine."""
+        for key, value in params.items():
+            if key in _WRAPPER_PARAMS:
+                setattr(self, key, value)
+            else:
+                self.pir_kwargs[key] = value
+        return self
+
+    def _build(self):
+        """Instantiate the engine from the merged configuration."""
+        kw = dict(self.pir_kwargs)
+        # Pass random_state through only if the engine accepts it; harmless
+        # to set as attribute either way.
+        try:
+            return PIRRegressor(random_state=self.random_state, **kw)
+        except TypeError:
+            return PIRRegressor(**kw)
+
+    def _alarm_seconds(self):
+        """Return the whole-second SIGALRM budget, or 0 if none can be armed."""
+        if not hasattr(signal, "SIGALRM"):
+            return 0  # platform without SIGALRM
+        if threading.current_thread() is not threading.main_thread():
+            return 0  # signal.signal() only works in the main thread
+        if not self.max_time or self.max_time <= 0:
+            return 0
+        # signal.alarm(0) cancels instead of arming, so round sub-second
+        # budgets up to one second.
+        return max(1, int(self.max_time))
+
+    def fit(self, X, y):
+        """Fit the engine within the time budget.
+
+        A constant model (the mean of y) is stored first, so that model() and
+        predict() stay usable if the budget expires. Exceptions other than the
+        timeout propagate to the caller.
+        """
+        # Guarantee a valid model exists before we risk a timeout: constant
+        # = mean(y). evaluate_model.py can always score this.
+        y_arr = np.asarray(y, dtype=float).ravel()
+        mean = float(np.mean(y_arr)) if y_arr.size else 0.0
+        # A NaN/inf mean would not give a usable constant; use 0.0 instead.
+        self._fallback_expr_ = repr(mean if np.isfinite(mean) else 0.0)
+        self.expr_ = self._fallback_expr_
+        self._inner_fitted_ = False
+        self._inner = self._build()
+
+        budget = self._alarm_seconds()
+        old_handler = None
+        outer_left = 0
+        started = time.monotonic()
+        if budget:
+            old_handler = signal.signal(signal.SIGALRM, _on_alarm)
+            # signal.alarm() replaces any alarm that is already pending (e.g.
+            # one armed by the harness) and returns the seconds it had left.
+            # Never outlive that outer deadline, and re-arm it afterwards.
+            outer_left = signal.alarm(budget)
+            if 0 < outer_left < budget:
+                signal.alarm(outer_left)
+        try:
+            self._inner.fit(X, y)
+            # Adapter exposes model() -> str(self.expr_); fall back to .expr_.
+            if hasattr(self._inner, "model"):
+                self.expr_ = str(self._inner.model())
+            elif hasattr(self._inner, "expr_"):
+                self.expr_ = str(self._inner.expr_)
+            self._inner_fitted_ = True
+        except _Timeout:
+            # Restore the constant fallback; predict() will use it as well.
+            self.expr_ = self._fallback_expr_
+        finally:
+            if budget:
+                signal.alarm(0)
+                if old_handler is not None:
+                    signal.signal(signal.SIGALRM, old_handler)
+                    if outer_left:
+                        elapsed = int(time.monotonic() - started)
+                        signal.alarm(max(1, outer_left - elapsed))
+        self.is_fitted_ = True
+        return self
+
+    def predict(self, X):
+        """Predict with the fitted engine, else with the constant fallback.
+
+        The constant is used when fit() timed out (so predictions agree with
+        the string returned by model()) or when the engine's predict() fails.
+        """
+        if getattr(self, "_inner_fitted_", False) and hasattr(self._inner, "predict"):
+            try:
+                return self._inner.predict(X)
+            except Exception as exc:  # best effort: scoring must not crash
+                warnings.warn(
+                    f"PIR engine predict() failed ({exc!r}); "
+                    "using the constant fallback model.",
+                    RuntimeWarning,
+                )
+        # Fallback: constant prediction (matches the fallback expr).
+        n = X.shape[0] if hasattr(X, "shape") else len(X)
+        try:
+            val = float(self._fallback_expr_)
+        except (TypeError, ValueError):
+            val = 0.0
+        return np.full(n, val, dtype=float)
+
+    def model(self):
+        """Return the expression string stored by fit()."""
+        return self.expr_
+
+
+# The estimator SRBench will fit. max_time is the param SRBench controls;
+# the harness also sends SIGALRM at the process level if fit() overruns.
+# NOTE(review): random_state=None here differs from the wrapper default of 0
+# cited in _PIR_VANILLA_CONFIG; presumably the harness sets random_state per
+# trial -- confirm before comparing against the archived SEED = 0 numbers.
+est = PIRClassicRegressor(max_time=3600, random_state=None)
+
+
+def _mentions_symbol(expr, name):
+    """Return True if `name` occurs in `expr` as a whole identifier."""
+    return re.search(rf"(?<!\w){re.escape(name)}(?!\w)", expr) is not None
+
+
+def model(est, X=None):
+    """Return a sympy-parseable model string with symbols matching X.columns.
+
+    Uses the SRBench-documented mapping idiom only if the engine emitted
+    generic names (x_0, x0, X0, ...). If the engine already uses the dataset
+    column names (as the handoff example '1.0*Ef*q2' suggests), the string is
+    returned unchanged.
+    """
+    expr = est.model() if hasattr(est, "model") else getattr(est, "expr_", "0.0")
+    expr = str(expr)
+
+    if X is None or not hasattr(X, "columns"):
+        return expr
+
+    cols = [str(c) for c in X.columns]
+    # If any real column name already appears, assume names are correct.
+    # Match whole identifiers only: a plain substring test lets a column named
+    # 'e' match the exponent in '1e-05', or 'x' match 'x_0'.
+    if any(_mentions_symbol(expr, c) for c in cols):
+        return expr
+
+    def _rename(match):
+        idx = int(match.group(1))
+        # Indices with no corresponding column are left untouched.
+        return cols[idx] if idx < len(cols) else match.group(0)
+
+    # Otherwise remap generic positional names -> dataset columns, using the
+    # first prefix style found. A single callback pass means an inserted
+    # column name is never substituted a second time.
+    for prefix in _GENERIC_PREFIXES:
+        pattern = re.compile(rf"(?<!\w){re.escape(prefix)}(\d+)(?!\w)")
+        if pattern.search(expr):
+            return pattern.sub(_rename, expr)
+    return expr
+
+
+# --- forwarded to evaluate_model.py ------------------------------------------
+# CRITICAL: scale_x/scale_y MUST be False. SRBench StandardScales X and y by
+# default, which destroys the units and exact coefficients PIR depends on
+# (a scaled run recovers ~nothing). The dev harness already forces these off on
+# the symbolic-data track, but we pin them so a non-sym run can't quietly zero
+# us out. skip_tuning=True: PIR has no GridSearch tuning step to run.
+#
+# NOTE: these keys target the `dev` branch signature (where PRs go):
+#   evaluate_model(..., scale_x, scale_y, pre_train, skip_tuning, sym_data)
+# If you test against `master` instead, it uses `test_params` rather than
+# `skip_tuning`; see SUBMISSION_NOTES.md.
+eval_kwargs = {
+    "scale_x": False,
+    "scale_y": False,
+    "skip_tuning": True,
+}