From 596afa4eddadc1db201eeccaa691d7e1e9d54463 Mon Sep 17 00:00:00 2001 From: jinbora0720 Date: Fri, 20 Mar 2026 12:53:49 -0400 Subject: [PATCH 1/6] first publish --- src/geom_nmf/__init__.py | 36 +++- src/geom_nmf/_base.py | 65 -------- src/geom_nmf/_viz.py | 77 --------- src/geom_nmf/core.py | 318 ++++++++++++++++++++++++++++++++++++ src/geom_nmf/endmembers.py | 327 +++++++++++++++++++++++++++++++++++++ src/geom_nmf/nfindr.py | 319 ++++++++++++++++++++++++++++++++++++ src/geom_nmf/plot.py | 128 +++++++++++++++ src/geom_nmf/utils.py | 58 +++++++ src/geom_nmf/weights.py | 146 +++++++++++++++++ tests/test_basics.py | 5 - tests/test_model.py | 135 ++++++++------- tests/test_viz.py | 44 ----- 12 files changed, 1394 insertions(+), 264 deletions(-) delete mode 100644 src/geom_nmf/_base.py delete mode 100644 src/geom_nmf/_viz.py create mode 100644 src/geom_nmf/core.py create mode 100644 src/geom_nmf/endmembers.py create mode 100644 src/geom_nmf/nfindr.py create mode 100644 src/geom_nmf/plot.py create mode 100644 src/geom_nmf/utils.py create mode 100644 src/geom_nmf/weights.py delete mode 100644 tests/test_basics.py delete mode 100644 tests/test_viz.py diff --git a/src/geom_nmf/__init__.py b/src/geom_nmf/__init__.py index 7c16065..0fad239 100644 --- a/src/geom_nmf/__init__.py +++ b/src/geom_nmf/__init__.py @@ -1,7 +1,35 @@ -from importlib.metadata import version +""" +geom_nmf +======== +Geometric NMF: geometric non-negative matrix factorization via maximum-volume simplex fitting. + +Public API +---------- +GeomNMF Main pipeline. +nfindr N-FINDR endmember extraction (standalone). +log_simplex_volume Log-volume of a simplex spanned by K points. +compute_Phi Source attribution matrix from source means and H. +estimate_weights Recover W from Y and H. +permute_to_reference Permute estimated H, W, Phi to best match a reference ordering. +barplot_Phi Grouped bar plot of Phi (% attribution per source and feature). +""" -from ._base import GeomNMF -from . import _viz as viz +from importlib.metadata import version +from .core import GeomNMF, GeomNMFResult +from .nfindr import nfindr +from .endmembers import log_simplex_volume +from .weights import compute_Phi, estimate_weights +from .utils import permute_to_reference +from .plot import barplot_Phi __version__ = version("geom-nmf") -__all__ = ["GeomNMF", "viz"] +__all__ = [ + "GeomNMF", + "GeomNMFResult", + "nfindr", + "log_simplex_volume", + "compute_Phi", + "estimate_weights", + "permute_to_reference", + "barplot_Phi" +] diff --git a/src/geom_nmf/_base.py b/src/geom_nmf/_base.py deleted file mode 100644 index 55bc15f..0000000 --- a/src/geom_nmf/_base.py +++ /dev/null @@ -1,65 +0,0 @@ -import numpy as np -from sklearn.base import BaseEstimator, RegressorMixin -from sklearn.utils.validation import check_X_y, check_array, check_is_fitted - - -class GeomNMF(BaseEstimator, RegressorMixin): - """Geospatial Non-negative Matrix Factorization regressor. - - Parameters - ---------- - n_components : int, default=10 - Number of latent components (NMF rank). - max_iter : int, default=200 - Maximum number of iterations. - tol : float, default=1e-4 - Convergence tolerance. - random_state : int or None, default=None - Random seed for reproducibility. - """ - - def __init__(self, n_components=10, max_iter=200, tol=1e-4, random_state=None): - self.n_components = n_components - self.max_iter = max_iter - self.tol = tol - self.random_state = random_state - - def fit(self, X, y): - """Fit the model to training data. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Training input samples. - y : array-like of shape (n_samples,) - Target values. - - Returns - ------- - self : GeomNMF - Fitted estimator. - """ - X, y = check_X_y(X, y) - - self.n_features_in_ = X.shape[1] - self.is_fitted_ = True - - return self - - def predict(self, X): - """Predict target values for input samples. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Input samples. - - Returns - ------- - y_pred : ndarray of shape (n_samples,) - Predicted values. - """ - check_is_fitted(self) - X = check_array(X) - - raise NotImplementedError("predict() is not yet implemented.") diff --git a/src/geom_nmf/_viz.py b/src/geom_nmf/_viz.py deleted file mode 100644 index e9c6e08..0000000 --- a/src/geom_nmf/_viz.py +++ /dev/null @@ -1,77 +0,0 @@ -import numpy as np - - -def plot_components(model, feature_names=None, ax=None): - """Plot the learned NMF components (basis vectors). - - Parameters - ---------- - model : GeomNMF - A fitted GeomNMF instance. - feature_names : list of str, optional - Names for each feature/column. - ax : matplotlib.axes.Axes, optional - Axes to plot on. Creates a new figure if not provided. - - Returns - ------- - ax : matplotlib.axes.Axes - """ - raise NotImplementedError - - -def plot_reconstruction_error(model, ax=None): - """Plot reconstruction error over iterations. - - Parameters - ---------- - model : GeomNMF - A fitted GeomNMF instance. - ax : matplotlib.axes.Axes, optional - Axes to plot on. Creates a new figure if not provided. - - Returns - ------- - ax : matplotlib.axes.Axes - """ - raise NotImplementedError - - -def plot_spatial_map(model, coords, component=0, ax=None): - """Plot the spatial distribution of a single NMF component. - - Parameters - ---------- - model : GeomNMF - A fitted GeomNMF instance. - coords : array-like of shape (n_samples, 2) - Longitude/latitude coordinates for each sample. - component : int, default=0 - Index of the component to plot. - ax : matplotlib.axes.Axes, optional - Axes to plot on. Creates a new figure if not provided. - - Returns - ------- - ax : matplotlib.axes.Axes - """ - raise NotImplementedError - - -def plot_predicted_vs_actual(y_true, y_pred, ax=None): - """Scatter plot of predicted vs. actual target values. - - Parameters - ---------- - y_true : array-like of shape (n_samples,) - True target values. - y_pred : array-like of shape (n_samples,) - Predicted target values. - ax : matplotlib.axes.Axes, optional - Axes to plot on. Creates a new figure if not provided. - - Returns - ------- - ax : matplotlib.axes.Axes - """ - raise NotImplementedError diff --git a/src/geom_nmf/core.py b/src/geom_nmf/core.py new file mode 100644 index 0000000..b42ee0e --- /dev/null +++ b/src/geom_nmf/core.py @@ -0,0 +1,318 @@ +""" +geom_nmf.core +============= +Main GeomNMF pipeline: maximum-volume simplex fitting for geometric NMF. +""" + +import time +from typing import NamedTuple +import numpy as np +import pandas as pd + +from geom_nmf.endmembers import ( + get_hull_candidates, + get_random_direction_candidates, + prune_close_points, + estimate_H, + log_simplex_volume +) +from geom_nmf.weights import estimate_weights, compute_Phi +from geom_nmf.nfindr import nfindr + + +class GeomNMFResult(NamedTuple): + """Return type of :func:`GeomNMF`.""" + H_star: np.ndarray # (K, J) estimated endmembers (row-normalized) + W_tilde: np.ndarray # (n, K) mixing weights scaled to original row sums + mu_tilde: np.ndarray # (K,) mean weight per source + Phi: np.ndarray # (K, J) source attribution matrix + logvol: float # log-volume of the selected simplex + weight_diagnostics: dict # diagnostics from weight recovery (see estimate_weights) + fit_diagnostics: dict # diagnostics from geometry / candidate steps + elapsed: float # total wall-clock run time in seconds + + +def GeomNMF( + Y: np.ndarray | pd.DataFrame, + K: int, + seed: int = 123, + rank_tol: float = 1e-12, + weight_tol: float = 1e-2, + candidate_method: str = "exact", + n_directions: int = 20000, + n_top: int = 1, + n_candidates: int | None = None, + prune: bool = False, + n_clusters: int | None = None, + refine_greedy: bool = False, + verbose: bool = False, +) -> GeomNMFResult: + """ + Geometric non-negative matrix factorization via maximum-volume simplex fitting (GeomNMF). + + Assumes the observed data matrix Y follows a simplex mixing model:: + + Y ≈ W H, W ≥ 0, H ≥ 0, H 1 = 1 + + and estimates H (endmembers) by finding the maximum-volume K-simplex + inscribed in the convex hull of the row-normalized data Y_star. + + Parameters + ---------- + Y : np.ndarray or pd.DataFrame, shape (n, J) + Observed data matrix. Rows are observations; columns are features. + All entries must be nonnegative (the method uses row-normalization). + K : int + Number of sources / endmembers to estimate. + seed : int, optional + Random seed (used for random candidate method and greedy refinement). + Default 123. + rank_tol : float, optional + Singular-value threshold for intrinsic rank determination; values below + this are treated as zero. Default 1e-12. + weight_tol : float, optional + Threshold for flagging simplex constraint violations in + ``weight_diagnostics``; rows where any weight is below ``-weight_tol`` + or the row sum deviates from 1 by more than ``weight_tol`` are flagged. + Default 1e-2. + candidate_method : {"exact", "random"}, optional + How to generate the pool of endmember candidates: + + * ``"exact"`` – convex-hull vertices via QHull (exact but may be + slow in high intrinsic dimension). + * ``"random"`` – random projection extremes (fast approximation; + controlled by *n_directions*, *n_top*, *n_candidates*). + + Default ``"exact"``. + n_directions : int, optional + Number of random directions for ``candidate_method="random"``. + Default 20000. + n_top : int, optional + Number of extreme points recorded per random direction (both ends) + when ``candidate_method="random"``. Default 1. + n_candidates : int or None, optional + Number of candidates to keep after random direction sampling + (highest-frequency first). Default None (keep all that appear). + prune : bool, optional + If True, cluster the candidates and keep one representative per + cluster before the exhaustive volume search. Useful when the + candidate pool is large. Default False. + n_clusters : int or None, optional + Number of clusters when *prune=True*. Defaults to ``10*K``. + refine_greedy : bool, optional + If True, run one pass of N-FINDR greedy swaps on the initial + solution to try to improve the simplex volume. Only meaningful when + ``candidate_method="random"`` or ``prune=True`` (with ``"exact"`` + and no pruning the exhaustive search is already globally optimal over + the hull vertices). Default False. + verbose : bool, optional + Print progress information. Default False. + + Raises + ------ + ValueError + If *Y* contains any NaN or Inf values, any negative values, any + all-zero rows, or if *candidate_method* is not ``"exact"`` or + ``"random"``. + + Returns + ------- + result : GeomNMFResult + A named tuple with the following fields: + + * ``H_star`` – np.ndarray, shape (K, J): estimated endmembers + (rows of row-normalized Y_star). + * ``W_tilde`` – np.ndarray, shape (n, K): estimated mixing weights + scaled back to the original row-sum magnitudes. + * ``mu_tilde`` – np.ndarray, shape (K,): mean weight per source. + * ``Phi`` – np.ndarray, shape (K, J): source attribution matrix + (see :func:`geom_nmf.weights.compute_Phi`). + * ``logvol`` – float: log-volume of the selected simplex. + * ``weight_diagnostics`` – dict: diagnostics from the weight recovery step + (see :func:`geom_nmf.weights.estimate_weights`), with keys: + + - ``"max_row_sum_dev_H"`` – max absolute deviation of H row sums from 1; + should be near zero for a well-formed endmember matrix. + - ``"rank_H_aug"`` – rank of the augmented system ``[H | 1]``; should + equal K for an identifiable model. + - ``"cond_G"`` – condition number of ``H_aug @ H_aug.T``; large values + indicate near-collinear endmembers and amplified weight errors. + - ``"I_err"`` – ``‖H_aug pinv(H_aug) − I‖_∞``; near zero means the + pseudoinverse is numerically accurate. + - ``"aug_resid_inf"`` – ``‖Y_aug − W_raw H_aug‖_∞``; reconstruction + error of the raw (pre-clipping) weights. + - ``"large_neg_rows"`` – row indices of observations where the raw + weight vector has a component below ``−weight_tol``; many such rows + suggest model misspecification. + - ``"large_neg_count"`` – number of rows in ``large_neg_rows``. + - ``"not_sum1_rows"`` – row indices where the raw weight row-sum + deviates from 1 by more than ``weight_tol``. + - ``"not_sum1_count"`` – number of rows in ``not_sum1_rows``. + * ``fit_diagnostics`` – dict: diagnostics from the geometry / candidate + steps, with keys: + + - ``"intrinsic_rank"`` – int: effective affine dimension of the data + (number of singular values above *rank_tol*); should equal K-1 for a + well-posed model. + - ``"n_cand"`` – int: number of candidate points passed to the + exhaustive max-volume search (after optional pruning). + - ``"cand_idx"`` – np.ndarray: indices (into the input *Y*) of the + candidate extreme points identified in Step 1. + - ``"direction_hit_counts"`` – np.ndarray or None: per-observation selection + frequency from random-direction sampling; None when + ``candidate_method="exact"``. + - ``"logvol_before_refinement"`` – float or None: log-volume of the exhaustive + search solution before greedy refinement; None when + ``refine_greedy=False`` or refinement was skipped. + * ``elapsed`` – float: total wall-clock run time in seconds. + """ + t_start = time.time() + + if isinstance(Y, pd.DataFrame): + Y = Y.to_numpy() + Y = np.asarray(Y, dtype=float) + + # check bad data + if not np.isfinite(Y).all(): + bad = np.argwhere(~np.isfinite(Y)) + raise ValueError( + f"Y contains {len(bad)} non-finite value(s) (NaN or Inf). " + "Remove or impute incomplete rows before calling GeomNMF." + ) + if (Y < 0).any(): + bad = np.argwhere(Y < 0) + raise ValueError( + f"Y contains {len(bad)} negative value(s). " + "GeomNMF requires a nonnegative data matrix." + ) + if (Y.sum(axis=1) == 0).any(): + bad_rows = np.flatnonzero(Y.sum(axis=1) == 0) + raise ValueError( + f"Y has {len(bad_rows)} all-zero row(s) (indices: {bad_rows.tolist()}). " + "Row normalization requires each row to have a positive sum." + ) + + # Row sums and normalized data + n = Y.shape[0] + r = Y.sum(axis=1, keepdims=True) + Y_star = Y / r # (n, J) + + # --- Step 0: project to intrinsic affine subspace --- + Y_star_reduced = Y_star[:, :-1].astype(float, copy=False) # (n, J-1) + mean = Y_star_reduced.mean(axis=0, keepdims=True) + Yc = Y_star_reduced - mean + _, S_svd, Vt = np.linalg.svd(Yc, full_matrices=False) + mask = S_svd > rank_tol + basis = Vt[mask].T # (J-1, rank) + Yc_proj = Yc @ basis # (n, rank) + + # --- Step 1: build candidate pool --- + counts = None # only populated for candidate_method="random" + if candidate_method == "exact": + cand_idx = get_hull_candidates(Yc_proj, verbose=verbose) + cand_proj = Yc_proj[cand_idx] + + elif candidate_method == "random": + # Whiten (scale to unit covariance -> remove directional stretch and make all directions comparable) + # intrinsic coordinates before random projection + C_cov = (Yc_proj.T @ Yc_proj) / max(n - 1, 1) + evals, evecs = np.linalg.eigh(C_cov) + evals = np.maximum(evals, rank_tol) + inv_sqrt_cov = evecs @ np.diag(1.0 / np.sqrt(evals)) @ evecs.T + Yc_proj_w = Yc_proj @ inv_sqrt_cov + cand_idx, counts = get_random_direction_candidates( + Yc_proj_w, seed=seed, n_directions=n_directions, n_top=n_top, n_candidates=n_candidates, verbose=verbose, + ) + cand_proj = Yc_proj[cand_idx] # un-whitened: pruning clusters in the same geometry as the volume objective + + else: + raise ValueError("candidate_method must be 'exact' or 'random'.") + + cand_ambient = Y_star[cand_idx] # (m, J) + + # --- Optional: cluster-based pruning --- + if prune: + _n_clusters = 10 * K if n_clusters is None else n_clusters + if _n_clusters >= len(cand_proj): + if verbose: + print( + f"Pruning skipped: candidate count {len(cand_proj)} ≤ n_clusters {_n_clusters}" + ) + else: + _, pruned_idx_local = prune_close_points(cand_proj, n_clusters=_n_clusters, seed=seed) + cand_ambient = cand_ambient[pruned_idx_local] + cand_idx = cand_idx[pruned_idx_local] + if verbose: + print(f"Number of pruned candidates: {len(pruned_idx_local)}") + + H_candidates = cand_ambient + + # --- Step 2: exhaustive max-volume search over candidates --- + H_star_hat, logvol_hat = estimate_H( + H_candidates, K, verbose=verbose + ) + + # --- Optional: N-FINDR greedy refinement --- + logvol_before = None + if refine_greedy: + if candidate_method == "exact" and not prune: + if verbose: + print( + "Greedy refinement skipped: exact hull without pruning is " + "already globally optimal over hull vertices." + ) + else: + logvol_before = logvol_hat + diffs = Y_star[:, np.newaxis, :] - H_star_hat[np.newaxis, :, :] # (n, K, J) + init_idx = np.argmin((diffs ** 2).sum(axis=2), axis=0) # (K,) + + _, refined_idx = nfindr( + Y_star, K, normalize=False, init_idx=init_idx, seed=seed, + ) + H_refined = Y_star[refined_idx] + logvol_refined, _ = log_simplex_volume(H_refined, rank_tol=rank_tol) + + if logvol_refined > logvol_before: + H_star_hat = H_refined + logvol_hat = logvol_refined + if verbose: + vol_ratio = np.exp(logvol_refined - logvol_before) + print( + f"Greedy refinement accepted; log-vol " + f"{logvol_before:.4f} → {logvol_refined:.4f} " + f"(volume ratio: {vol_ratio:.3f}x)" + ) + else: + if verbose: + print( + f"Greedy refinement rejected; refined log-vol " + f"{logvol_refined:.4f} did not improve over " + f"{logvol_before:.4f}" + ) + + # --- Step 3: recover mixing weights --- + W_star_hat, weight_diagnostics = estimate_weights(Y_star, H_star_hat, weight_tol=weight_tol, verbose=verbose) + W_tilde_hat = W_star_hat * r + mu_tilde_hat = W_tilde_hat.mean(axis=0) + Phi_hat = compute_Phi(mu_tilde_hat, H_star_hat) + + elapsed = time.time() - t_start + + fit_diagnostics = { + "intrinsic_rank": int(mask.sum()), + "n_cand": len(H_candidates), + "cand_idx": cand_idx, + "direction_hit_counts": counts, # np.ndarray if candidate_method="random", else None + "logvol_before_refinement": logvol_before, # float if refine_greedy ran, else None + } + + return GeomNMFResult( + H_star=H_star_hat, + W_tilde=W_tilde_hat, + mu_tilde=mu_tilde_hat, + Phi=Phi_hat, + logvol=logvol_hat, + weight_diagnostics=weight_diagnostics, + fit_diagnostics=fit_diagnostics, + elapsed=elapsed, + ) diff --git a/src/geom_nmf/endmembers.py b/src/geom_nmf/endmembers.py new file mode 100644 index 0000000..8bee0d3 --- /dev/null +++ b/src/geom_nmf/endmembers.py @@ -0,0 +1,327 @@ +""" +geom_nmf.endmembers +=================== +Candidate endmember generation: convex-hull vertices (exact) or random +projection extremes (approximate), plus clustering-based pruning. +Volume scoring and exhaustive maximum-volume simplex search over a candidate set of endmember points. +""" + +import time +import numpy as np +from itertools import combinations +from scipy.special import gammaln, comb + +def get_hull_candidates( + Yc_proj: np.ndarray, + verbose: bool = False, +) -> np.ndarray: + """ + Return convex-hull vertex indices of *Yc_proj* via QHull. + + Parameters + ---------- + Yc_proj : np.ndarray, shape (n, r) + Centred, intrinsic-space projections of the data. + verbose : bool, optional + Print timing and candidate count. Default False. + + Returns + ------- + cand_idx : np.ndarray, shape (m,) + Indices (into *Yc_proj*) of the convex-hull vertices. + """ + from scipy.spatial import ConvexHull + + if verbose: + print("Computing convex hull...", end="", flush=True) + + start = time.time() + hull = ConvexHull(Yc_proj, qhull_options="Qx Qt Q12 Pp") + + if verbose: + print(f" done in {time.time() - start:.2f}s; #cands={len(hull.vertices)}") + + return hull.vertices + + +def get_random_direction_candidates( + Yc_proj_w: np.ndarray, + seed: int = 123, + n_directions: int = 20000, + n_top: int = 1, + n_candidates: int | None = None, + verbose: bool = False, +) -> tuple[np.ndarray, np.ndarray]: + """ + Find extreme points by projecting onto many random directions (vectorized). + + For each random unit vector u the *n_top* largest **and** smallest dot + products ``Yc_proj_w @ u`` are recorded. Points that appear as extremes + most often are the most likely simplex vertices. + + Parameters + ---------- + Yc_proj_w : np.ndarray, shape (n, r) + Whitened, centered projections of the data in intrinsic space. + seed : int, optional + Random seed for reproducibility. Default 123. + n_directions : int, optional + Number of random directions to sample. Default 20000. + n_top : int, optional + How many extreme points to record per direction (both ends). + Default 1. + n_candidates : int or None, optional + If given, return at most this many candidate indices (highest counts + first). Default None (return all that were ever selected). + verbose : bool, optional + Print timing and candidate count. Default False. + + Returns + ------- + chosen_sorted : np.ndarray, shape (m,) + Indices (into *Yc_proj_w*) of candidate extreme points, sorted by + descending selection frequency. + counts : np.ndarray, shape (n,) + Full frequency array for all n points. + """ + X = np.asarray(Yc_proj_w, dtype=float) + n, r = X.shape + rng = np.random.default_rng(seed) + counts = np.zeros(n, dtype=np.intp) + + if verbose: + print( + f"Sampling {n_top} extreme points in each of {n_directions} random directions...", + end="", + flush=True, + ) + start = time.time() + + batch = min(n_directions, 2048) + remaining = n_directions + while remaining > 0: + b = min(batch, remaining) + remaining -= b + + U = rng.standard_normal((r, b)) + U /= np.linalg.norm(U, axis=0, keepdims=True) + 1e-12 + + S = X @ U # (n, b) + + if n_top == 1: + np.add.at(counts, S.argmax(axis=0), 1) + np.add.at(counts, S.argmin(axis=0), 1) + else: + St = S.T # (b, n) + hi_idx = np.argpartition(St, -n_top, axis=1)[:, -n_top:] + lo_idx = np.argpartition(St, n_top, axis=1)[:, :n_top] + np.add.at(counts, hi_idx.ravel(), 1) + np.add.at(counts, lo_idx.ravel(), 1) + + chosen = np.flatnonzero(counts > 0) + chosen_sorted = chosen[np.argsort(counts[chosen])[::-1]] # min(n, 2n_directions x n_top) candidates, sorted by count + + if n_candidates is not None: + chosen_sorted = chosen_sorted[:n_candidates] # min(n, 2n_directions x n_top, n_candidates) candidates + + if verbose: + print(f" done in {time.time() - start:.2f}s; #cands={len(chosen_sorted)}") + + return chosen_sorted, counts + +def prune_close_points( + points: np.ndarray, + n_clusters: int = 25, + seed: int = 123, +) -> tuple[np.ndarray, list[int]]: + """ + Cluster *points* and return one representative per cluster. + + The representative is the cluster member closest (in Euclidean distance) + to the cluster centroid, so the returned points are always actual data + points (not interpolated centroids). + + The number of clusters is chosen automatically by maximizing the + silhouette score over the range + ``[max(2, n_clusters // 2), min(2 * n_clusters, m - 1)]``, + centered around *n_clusters*. + + Parameters + ---------- + points : np.ndarray, shape (m, J) + Points to prune. Must be finite (no NaN or Inf). + n_clusters : int, optional + Target number of representatives (clusters) to retain after pruning. The + silhouette search explores a range around this value, so the actual + number returned may differ slightly. Default 25. + seed : int, optional + Random seed passed to MiniBatchKMeans. Default 123. + + Returns + ------- + representatives : np.ndarray, shape (n_clusters_out, J) + Selected representative points. + selected_indices : list of int + Row indices into *points* for each representative. + """ + X = np.asarray(points, dtype=float) + m = X.shape[0] + + if m == 0: + return X, [] + if m == 1: + return X.copy(), [0] + + # try to import sklearn lazily; if missing, skip pruning safely. + try: + from sklearn.cluster import MiniBatchKMeans + from sklearn.metrics import silhouette_score + except ImportError: + return X, list(range(m)) + + lower = max(2, n_clusters // 2) + upper = min(2 * n_clusters, m - 1) + + if lower > upper: + return X, list(range(m)) + + best_k, best_s = lower, -np.inf + for k in range(lower, upper + 1): + km = MiniBatchKMeans(n_clusters=k, random_state=seed, n_init=10, batch_size=4096) + labels = km.fit_predict(X) + if len(np.unique(labels)) < 2: + continue + s = silhouette_score( + X, labels, + metric="euclidean", + sample_size=min(10000, m), + random_state=seed, + ) + if s > best_s: + best_s, best_k = s, k + + km = MiniBatchKMeans(n_clusters=best_k, random_state=seed, n_init=10, batch_size=4096) + labels = km.fit_predict(X) + centers = km.cluster_centers_ + + selected = [] + for k in range(best_k): + members = np.flatnonzero(labels == k) + if len(members) == 0: + continue + diffs = X[members] - centers[k] + nearest = members[np.einsum("ij,ij->i", diffs, diffs).argmin()] + selected.append(int(nearest)) + + selected = sorted(set(selected)) + return X[selected], selected + + +def log_simplex_volume( + subset: np.ndarray, + rank_tol: float = 1e-12, +) -> tuple[float, int]: + """ + Compute the log-volume of the (K-1)-simplex spanned by *subset*. + + The volume is defined as:: + + vol = (∏ sᵢ) / r! + + where sᵢ are the nonzero singular values of the edge matrix + ``subset[1:] - subset[0]`` and r is the affine rank. + + Parameters + ---------- + subset : np.ndarray, shape (K, J) + K points in J-dimensional ambient space. + rank_tol : float, optional + Singular-value threshold below which a dimension is considered zero. + Default 1e-12. + + Returns + ------- + log_vol : float + Natural log of the simplex volume, or ``-np.inf`` if degenerate. + r : int + Affine rank of *subset* (number of non-negligible singular values). + """ + base = subset[0] + A = subset[1:] - base # edge matrix, shape (K-1, J) + _, S, _ = np.linalg.svd(A, full_matrices=False) + + mask = S > rank_tol + r = int(mask.sum()) + + if r < A.shape[0]: # degenerate simplex + return -np.inf, r + + log_vol = float(np.log(S[mask]).sum() - gammaln(r + 1)) + return log_vol, r + + +def estimate_H( + hull_pts: np.ndarray, + K: int, + verbose: bool = False, +) -> tuple[np.ndarray, float]: + """ + Find the K-point subset of *hull_pts* that maximizes simplex log-volume. + + Exhaustively iterates over all C(m, K) combinations. Feasible when + the number of hull / candidate points is small (≲ a few hundred). + + Parameters + ---------- + hull_pts : np.ndarray, shape (m, J) + Candidate points (convex-hull vertices or random-direction extremes) + in the ambient endmember space. + K : int + Number of endmembers (simplex vertices) to select. + verbose : bool, optional + If True, prints the number of combinations and shows a tqdm progress + bar (requires tqdm to be installed). Default False. + + Returns + ------- + H_hat_best : np.ndarray, shape (K, J) + The K rows of *hull_pts* that achieve the maximum log-volume. + best_logvol : float + Corresponding log-volume value. + + Raises + ------ + RuntimeError + If every K-subset of *hull_pts* forms a degenerate simplex (zero + volume), e.g. when all candidates are collinear. + """ + m = int(hull_pts.shape[0]) + total = int(comb(m, K, exact=True)) + if verbose: + print(f"H candidates={m}, K={K}, combinations={total:,}") + + it = combinations(range(m), K) + if verbose: + try: + from tqdm import tqdm as _tqdm + it = _tqdm(it, total=total, desc=f"Searching K={K} subsets for max volume") + except ImportError: + pass + + best_logvol = -np.inf + best_inds: tuple | None = None + + for inds in it: + logvol, _ = log_simplex_volume(hull_pts[list(inds)]) + if logvol > best_logvol: + best_logvol = logvol + best_inds = inds + + if best_inds is None: + raise RuntimeError( + f"No non-degenerate K={K}-simplex found among the {m} candidate points. " + "The candidate set may be collinear or too small." + ) + + H_hat_best = hull_pts[list(best_inds)] + return H_hat_best, float(best_logvol) diff --git a/src/geom_nmf/nfindr.py b/src/geom_nmf/nfindr.py new file mode 100644 index 0000000..8ba7d79 --- /dev/null +++ b/src/geom_nmf/nfindr.py @@ -0,0 +1,319 @@ +""" +geom_nmf.nfindr +=============== +N-FINDR endmember extraction (NumPy-only), based on Winter (1999) with +documented modifications. + +Reference +--------- +Winter, M. E. (1999). N-FINDR: An algorithm for fast autonomous spectral +end-member determination in hyperspectral data. *Proc. SPIE 3753*. +""" + +import math +import numpy as np +import pandas as pd + + +# --------------------------------------------------------------------------- +# Private helpers +# --------------------------------------------------------------------------- + +def _pca_reduce( + Y: np.ndarray, + d: int, +) -> np.ndarray: + """ + Center *Y* and project onto its top *d* principal components. + + Parameters + ---------- + Y : np.ndarray, shape (n, J) + Input data matrix. + d : int + Number of principal components to retain. + + Returns + ------- + Z : np.ndarray, shape (n, d) + Projected coordinates. + """ + Yc = Y - Y.mean(axis=0, keepdims=True) + U, S, _ = np.linalg.svd(Yc, full_matrices=False) + return U[:, :d] * S[:d] + +def _build_simplex_matrix( + Z: np.ndarray, + idx: np.ndarray, + K: int, +) -> tuple[np.ndarray, float]: + """ + Build the augmented simplex matrix M and compute its volume. + + Parameters + ---------- + Z : np.ndarray, shape (n, K-1) + Data in the reduced PCA space. + idx : np.ndarray, shape (K,) + Indices of the K simplex vertices into Z. + K : int + Number of vertices. + + Returns + ------- + M : np.ndarray, shape (K, K) + Augmented matrix [[Z[idx].T], [1...1]]. + vol : float + Simplex volume |det(M)| / (K-1)! + """ + M = np.vstack([Z[idx].T, np.ones((1, K))]) + vol = abs(float(np.linalg.det(M))) / math.factorial(K - 1) + return M, vol + +def _init_indices_random( + n: int, + K: int, + rng: np.random.Generator, +) -> np.ndarray: + """ + Draw *K* distinct indices uniformly at random. + + Parameters + ---------- + n : int + Total number of samples. + K : int + Number of indices to draw. + rng : np.random.Generator + NumPy random generator instance. + + Returns + ------- + idx : np.ndarray, shape (K,), dtype int + Selected indices (no duplicates). + """ + return rng.choice(n, size=K, replace=False) + + +def _init_indices_atgp( + Z: np.ndarray, + K: int, +) -> np.ndarray: + """ + ATGP-style greedy initialization in the reduced PCA space. + + Iteratively adds the point whose residual (after projecting out the + subspace spanned by all current selections) has the largest norm. + + Parameters + ---------- + Z : np.ndarray, shape (n, d) + Data in the reduced (K-1)-dimensional PCA space. + K : int + Number of endmembers to initialise. + + Returns + ------- + idx : np.ndarray, shape (K,), dtype int + Indices of the K initialization points. + """ + idx: list[int] = [] + norms = np.einsum("ij,ij->i", Z, Z) + idx.append(int(np.argmax(norms))) + + for _ in range(1, K): + # Q: (d, m=len(idx)) orthonormal basis for the row space of Z[idx] + Q, _ = np.linalg.qr(Z[idx].T) + # project each row of Z onto the subspace spanned by current selections + proj = Z @ (Q @ Q.T) + resid_sq = np.einsum("ij,ij->i", Z - proj, Z - proj) + resid_sq[idx] = -np.inf + # add the point with the largest orthogonal residual (most unexplained) + idx.append(int(np.argmax(resid_sq))) + + return np.array(idx, dtype=int) + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +def nfindr( + Y: np.ndarray | pd.DataFrame, + K: int, + max_iter: int = 5, + seed: int | None = None, + normalize: bool = True, + init: str = "atgp", + init_idx: np.ndarray | None = None, +) -> tuple[np.ndarray, np.ndarray]: + """ + N-FINDR endmember extraction (NumPy-only, no SciPy). + + Finds K endmembers by iteratively replacing each current endmember + candidate with the pixel that maximizes the simplex volume in the + (K-1)-dimensional PCA-reduced space. + + Parameters + ---------- + Y : np.ndarray or pd.DataFrame, shape (n, J) or (rows, cols, J) + Input spectral data. Can be a 2-D matrix or a 3-D hyperspectral + cube; the cube is reshaped to (rows*cols, J) internally. + All entries should be nonnegative (typical for spectral data). + K : int + Number of endmembers to extract. Must satisfy ``2 ≤ K ≤ J + 1``. + max_iter : int, optional + Maximum number of full replacement sweeps (each sweep considers + swapping every one of the K current endmembers with every pixel). + Iteration stops early if no improvement is found. Default 5. + seed : int or None, optional + Random seed for reproducibility; used only when ``init="random"``. + Default None. + normalize : bool, optional + If True, L2-normalize each spectrum (row) before PCA projection. + Default True. + init : {"atgp", "random"}, optional + Initialisation strategy when *init_idx* is not provided: + + * ``"atgp"`` – greedy ATGP initialization. + Recommended; differs from Winter (1999)'s original random init. + * ``"random"`` – K indices drawn uniformly at random (original + Winter (1999) behavior). + + Default ``"atgp"``. + init_idx : array-like of int, shape (K,) or None, optional + If supplied, use these K indices as the starting endmember set and + ignore *init*. All indices must be distinct and in ``[0, n)``. + Default None. + + Returns + ------- + E : np.ndarray, shape (K, J) + Extracted endmember spectra in the original feature space (rows of + the **unnormalized** *Y*). + idx : np.ndarray, shape (K,), dtype int + Indices of the selected pixels in the flattened data array. + + Raises + ------ + ValueError + If *Y* is not 2-D or 3-D, if K is out of range, if n < K, or if + *init_idx* is invalid (wrong shape, duplicates, or out-of-bounds + indices). + + Notes + ----- + Deviations from Winter (1999): + + * Defaults to ATGP initialization (Plaza et al., 2002) instead of + random; pass ``init="random"`` to recover the original behavior. + * Optional L2 row-normalization before PCA (``normalize=True`` by + default); Winter (1999) does not normalize. + * The inner replacement loop is vectorized via the matrix-determinant + lemma, scoring all n candidate pixels at once per vertex. This is + mathematically equivalent to the original but O(N·K²) per vertex + instead of O(N·K³). A fallback to explicit volume recomputation + is used when the current simplex is numerically degenerate. + + Volume formula (for reference):: + + vol = |det(M)| / (K-1)!, M = [[Z.T], [1...1]] ∈ R^{K×K} + """ + + # process data to a working matrix + if isinstance(Y, pd.DataFrame): + Y = Y.to_numpy() + + if Y.ndim == 3: + rows, cols, J = Y.shape + Y2 = Y.reshape(rows * cols, J) + elif Y.ndim == 2: + Y2 = Y + else: + raise ValueError("Y must be 2-D (n, J) or 3-D (rows, cols, bands).") + + n, J = Y2.shape + + if normalize: + norms = np.linalg.norm(Y2, axis=1, keepdims=True) + norms[norms == 0.0] = 1.0 + Yn = Y2 / norms + else: + Yn = Y2 + + if K < 2: + raise ValueError(f"K must be at least 2 (a single endmember has zero simplex volume); got K={K}.") + if K > J + 1: + raise ValueError(f"K must be ≤ J+1={J + 1} (PCA reduces to K-1={K - 1} dims but data has only J={J} features); got K={K}.") + if n < K: + raise ValueError(f"Need at least K={K} observations to form a simplex; got n={n}.") + + Z = _pca_reduce(Yn, d=K - 1) # (n, K-1) + + # initialize K indices + rng = np.random.default_rng(seed) + if init_idx is not None: + idx = np.asarray(init_idx, dtype=int) + if idx.shape != (K,): + raise ValueError( + f"init_idx must have exactly K={K} entries; got shape {idx.shape}." + ) + if len(set(idx.tolist())) != K: + raise ValueError("init_idx must not contain duplicate indices.") + if idx.min() < 0 or idx.max() >= n: + raise ValueError(f"init_idx entries must lie in [0, n={n}).") + idx = idx.copy() + elif init == "atgp": + idx = _init_indices_atgp(Z, K) + elif init == "random": + idx = _init_indices_random(n, K, rng) + else: + raise ValueError("init must be 'atgp' or 'random'.") + + M, best_vol = _build_simplex_matrix(Z, idx, K) + + for _ in range(max_iter): + improved = False + for j in range(K): + locked = set(idx.tolist()) - {idx[j]} + det_M = np.linalg.det(M) + if abs(det_M) < 1e-300: + # Degenerate — fall back to explicit volume recomputation + best_j = idx[j] + trial = idx.copy() + for cand in range(n): + if cand in locked: + continue + trial[j] = cand + _, v = _build_simplex_matrix(Z, trial, K) + if v > best_vol: + best_vol = v + best_j = cand + if best_j != idx[j]: + idx[j] = best_j + M, best_vol = _build_simplex_matrix(Z, idx, K) # safer to recompute + improved = True + continue + + # Vectorized scoring via matrix-determinant lemma: + # det(M_new) = det(M) + (c_n - M[:,j]) · adj(M)[:,j] + # where adj(M)[:,j] = det(M) * (M^{-1})[j,:] + M_inv = np.linalg.inv(M) + adj_col_j = det_M * M_inv[j, :] # (K,) + M_j_dot = float(M[:, j] @ adj_col_j) + scores = Z @ adj_col_j[:-1] + adj_col_j[-1] # (n,) + new_dets = det_M + (scores - M_j_dot) + new_vols = np.abs(new_dets) / math.factorial(K - 1) + + new_vols[np.array(list(locked), dtype=int)] = -np.inf + + best_j = int(np.argmax(new_vols)) + if new_vols[best_j] > best_vol: + idx[j] = best_j + M, best_vol = _build_simplex_matrix(Z, idx, K) # safer to recompute + improved = True + + if not improved: + break + + return Y2[idx, :], idx diff --git a/src/geom_nmf/plot.py b/src/geom_nmf/plot.py new file mode 100644 index 0000000..7153341 --- /dev/null +++ b/src/geom_nmf/plot.py @@ -0,0 +1,128 @@ +""" +geom_nmf.plot +============= +Plotting helpers for GeomNMF results. +""" + +import numpy as np +import matplotlib.pyplot as plt + + +def barplot_Phi( + Phi: np.ndarray, + title: str = "Estimated source attribution (GeomNMF)", + feature_labels: list[str] | None = None, + feature_colors = None, + show_err: bool = True, + savepath: str | None = None, +) -> None: + """ + Grouped bar plot of the source attribution matrix Phi. + + Each group of bars corresponds to one source (k); within each group + there is one bar per feature (j), with height ``Phi[k, j] * 100`` (%). + When a bootstrap stack is provided, bars show the mean across replicates + and optional error bars show ±1 standard deviation. + + Parameters + ---------- + Phi : array-like + Either: + + * shape ``(K, J)``: single Phi matrix. + * shape ``(n_reps, K, J)``: bootstrap stack; mean and std are + computed across the first axis. + + Values are assumed to be in [0, 1] and are multiplied by 100 for + display. + title : str, optional + Figure title. Default ``"Estimated source attribution (GeomNMF)"``. + feature_labels : list[str] or None, optional + Feature labels of length J used in the legend. + If None, defaults to ``["Feature 1", ..., "Feature J"]``. + feature_colors : None, list, or dict, optional + Bar colors per feature. None uses matplotlib defaults; a list of + length J assigns colors by position; a dict maps feature label to + color. Default None. + show_err : bool, optional + If True and *Phi* is 3D, draw error bars (±1 std across bootstrap + reps). Default True. + savepath : str or None, optional + File path to save the figure (200 dpi, tight bbox). Default None. + + + Returns + ------- + None + Displays the figure (and saves it if *savepath* is given). + + Raises + ------ + ValueError + If *Phi* is not 2D or 3D, or if *feature_labels* has the wrong + length. + """ + A = np.asarray(Phi) + if A.ndim == 2: + K, J = A.shape # (K, J) + Phi_mean = A * 100 # in % + Phi_std = None + elif A.ndim == 3: + n_reps, K, J = A.shape # (n_reps, K, J) + Phi_mean = np.nanmean(A, axis=0) * 100 # (K, J), in % + Phi_std = np.nanstd(A, axis=0, ddof=1 if n_reps > 1 else 0) * 100 if show_err else None + else: + raise ValueError(f"Expected 2D (K, J) or 3D (n_reps, K, J); got shape {A.shape}") + + # feature labels + if feature_labels is None: + labels = [f"Feature {j+1}" for j in range(J)] + else: + if len(feature_labels) != J: + raise ValueError(f"feature_labels must have length {J} (got {len(feature_labels)})") + labels = list(feature_labels) + + x = np.arange(K) # one tick per source + width = 0.8 / max(J, 1) # bar width within each source group + + fig, ax = plt.subplots(figsize=(10, 5)) + + for j in range(J): + # build error bar kwargs only if we have usable stds + kwargs = {} + if Phi_std is not None: + yerr = Phi_std[:, j] # (K,) std for feature j across sources + if np.any(np.isfinite(yerr)): + kwargs["yerr"] = yerr + kwargs["error_kw"] = {"elinewidth": 1.2, "capsize": 3} + + # choose color for this feature + if feature_colors is None: + c = None + elif isinstance(feature_colors, dict): + c = feature_colors.get(labels[j], None) + else: + c = feature_colors[j] + + ax.bar( + x + j * width, + Phi_mean[:, j], # (K,) attribution of feature j per source + width, + color=c, + label=labels[j], + **kwargs, + ) + + ax.set_xticks(x + (J - 1) * width / 2) + ax.set_xticklabels([f"Source {k+1}" for k in range(K)]) + ax.set_ylabel("Expected % of pollutants\nattributable to each source") + ax.set_title(title) + ax.set_ylim(0, 100) + + # Legend outside on the right + ax.legend(title="", loc='center left', bbox_to_anchor=(1, 0.5), frameon=False) + plt.tight_layout() + + if savepath: + fig.savefig(savepath, dpi=200, bbox_inches="tight") + plt.show() diff --git a/src/geom_nmf/utils.py b/src/geom_nmf/utils.py new file mode 100644 index 0000000..76999d0 --- /dev/null +++ b/src/geom_nmf/utils.py @@ -0,0 +1,58 @@ +""" +geom_nmf.utils +============== +Algorithmic utilities: endmember permutation matching +""" + +import numpy as np +from scipy.optimize import linear_sum_assignment + +def permute_to_reference( + H_ref: np.ndarray, + H: np.ndarray, + mu: np.ndarray, + Phi: np.ndarray, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + Permute estimated endmembers to best match a reference ordering. + + Uses the Hungarian algorithm to find the permutation of rows of *H* + that minimizes total L2 distance to *H_ref*, then applies the same + permutation to *mu* and *Phi*. + + Parameters + ---------- + H_ref : np.ndarray, shape (K, J) + Reference endmember matrix; rows are sources. + H : np.ndarray, shape (K, J) + Estimated endmember matrix whose rows are to be permuted. + mu : np.ndarray, shape (K,) + Estimated mean weight per source; permuted alongside H. + Phi : np.ndarray, shape (K, J) + Source attribution matrix (rows are sources, columns are features); + permuted alongside H. + + Returns + ------- + H_perm : np.ndarray, shape (K, J) + Permuted endmember matrix aligned to H_ref. + mu_perm : np.ndarray, shape (K,) + Permuted mean weights. + Phi_perm : np.ndarray, shape (K, J) + Permuted source attribution matrix. + order : np.ndarray, shape (K,), dtype int + Permutation indices such that ``H_perm = H[order]``. + """ + H_ref = np.asarray(H_ref) + H = np.asarray(H) + mu = np.asarray(mu) + Phi = np.asarray(Phi) + + # pairwise L2 cost: cost[i, j] = ||H_ref[i] - H[j]|| + cost = np.linalg.norm(H_ref[:, None, :] - H[None, :, :], axis=2) # (K_ref, K) + row_ind, col_ind = linear_sum_assignment(cost) + + # reorder so that estimated rows align with H_ref row order + order = col_ind[np.argsort(row_ind)] + + return H[order], mu[order], Phi[order], order \ No newline at end of file diff --git a/src/geom_nmf/weights.py b/src/geom_nmf/weights.py new file mode 100644 index 0000000..d812c7a --- /dev/null +++ b/src/geom_nmf/weights.py @@ -0,0 +1,146 @@ +""" +geom_nmf.weights +================ +Intensity (mixing weight) recovery and source attribution matrix computation. +""" + +import warnings +import numpy as np + + +def compute_Phi( + mu: np.ndarray, + H: np.ndarray, +) -> np.ndarray: + """ + Compute the source attribution matrix Phi from source means and H. + + Each entry ``Phi[k, j]`` is the fraction of feature j attributable to + source k, weighted by the source mean abundance:: + + Phi[k, j] = mu[k] * H[k, j] / sum_k'(mu[k'] * H[k', j]) + + Parameters + ---------- + mu : np.ndarray, shape (K,) + Mean intensity of each of the K sources. + H : np.ndarray, shape (K, J) + Endmember matrix; rows are source profiles, columns are features. + + Returns + ------- + Phi : np.ndarray, shape (K, J) + Source composition matrix; columns sum to 1 over sources. + + Notes + ----- + If all sources contribute zero to a feature (column of ``mu[:, None] * H`` + sums to zero), the corresponding column of *Phi* will be NaN. + """ + numerator = mu[:, None] * H # (K, J) + denominator = numerator.sum(axis=0) # (J,) + + Phi = numerator / denominator # (K, J) + return Phi + + +def estimate_weights( + Y: np.ndarray, + H: np.ndarray, + weight_tol: float = 1e-2, + verbose: bool = False, +) -> tuple[np.ndarray, dict]: + """ + Solve weights (intensity) W such that Y ≈ W H under the affine sum-to-one + constraint, via the right pseudoinverse of the augmented system [H | 1]. + + After computing ``W_raw = Y_aug @ pinv(H*_aug)`` the rows are projected + onto the probability simplex by clipping negatives and renormalizing + because rows of Y and rows of H are assumed to sum to 1. + + Parameters + ---------- + Y : np.ndarray, shape (n, J) + Row-normalized observed data matrix; rows are observations, columns are features. + H : np.ndarray, shape (K, J) + Row-stochastic endmember matrix; rows are source profiles. + weight_tol : float, optional + Threshold for flagging rows with large simplex violations (negative + weights or sum-deviation above this value). Default 1e-2. + verbose : bool, optional + Print diagnostic statistics. Default False. + + Returns + ------- + W : np.ndarray, shape (n, K) + Mixing weights; nonnegative, rows sum to 1. + diag : dict + Diagnostic information: + + * ``"max_row_sum_dev_H"`` – max deviation of H row sums from 1 + * ``"rank_H_aug"`` – rank of augmented [H | 1] + * ``"cond_G"`` – condition number of H_aug @ H_aug.T; large values + mean small errors in H_aug are amplified into large errors in + H_aug_R and therefore in W_raw + * ``"I_err"`` – ``||H_aug pinv(H_aug) - I||_inf`` + * ``"aug_resid_inf"`` – ``||Y_aug - W_raw H_aug||_inf`` + * ``"large_neg_rows"`` – row indices where min W_raw < -weight_tol + * ``"large_neg_count"`` – number of such rows + * ``"not_sum1_rows"`` – row indices where row sum of W_raw deviates from 1 + by more than weight_tol + * ``"not_sum1_count"`` – number of such rows + + """ + H_in = np.asarray(H, dtype=float) + K, J = H_in.shape + n = Y.shape[0] + + # augment to encode sum-to-one constraint + H_aug = np.hstack([H_in, np.ones((K, 1))]) # (K, J+1) + Y_aug = np.hstack([Y, np.ones((n, 1))]) # (n, J+1) + + # right inverse via SVD pseudoinverse + H_aug_R = np.linalg.pinv(H_aug) # (J+1, K) + + # raw intensity + W_raw = Y_aug @ H_aug_R # (n, K) + + # diagnostics + neg_mask = W_raw.min(axis=1) < -weight_tol # large negative weights + sum1_mask = np.abs(W_raw.sum(axis=1) - 1.0) > weight_tol # row sum deviates from 1 + + G = H_aug @ H_aug.T + try: + condG = float(np.linalg.cond(G)) # large cond_G => small errors in H_aug amplified into large erros in H_aug_R + except np.linalg.LinAlgError: + condG = float("inf") + I_err = float(np.linalg.norm(H_aug @ H_aug_R - np.eye(K), ord=np.inf)) + aug_resid = float(np.linalg.norm(Y_aug - W_raw @ H_aug, ord=np.inf)) + + if verbose: + print(f"||H_aug H_aug_R - I||_inf: {I_err:.3e}") + + diag = { + "max_row_sum_dev_H": float(np.max(np.abs(H_in.sum(axis=1) - 1.0))), + "rank_H_aug": int(np.linalg.matrix_rank(H_aug)), + "cond_G": condG, + "I_err": I_err, + "aug_resid_inf": aug_resid, + "large_neg_rows": np.flatnonzero(neg_mask), + "large_neg_count": int(neg_mask.sum()), + "not_sum1_rows": np.flatnonzero(sum1_mask), + "not_sum1_count": int(sum1_mask.sum()), + } + + # clip negatives and renormalize. + # note: if Y and H are both row stochastic and the model is well-specified, + # W_raw is already row stochastic and this is a no-oeration beyond numerical noise. + # under misspecification, clipping introduces bias — use diag to assess severity. + W = np.maximum(W_raw, 0.0) + s = W.sum(axis=1, keepdims=True) + # s == 0 only if every weight in a row is negative — degenerate, shouldn't + # occur when Y and H are row stochastic; guard defensively + s[s == 0.0] = 1.0 + W /= s + + return W, diag diff --git a/tests/test_basics.py b/tests/test_basics.py deleted file mode 100644 index 91b70b5..0000000 --- a/tests/test_basics.py +++ /dev/null @@ -1,5 +0,0 @@ -from geom_nmf import __version__ - - -def test_version(): - assert __version__ == '0.1.0a0' diff --git a/tests/test_model.py b/tests/test_model.py index 71d84fe..fc7b28a 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1,74 +1,71 @@ +""" +Basic smoke test for GeomNMF and barplot_Phi. + +Generates synthetic data from a known K=3 simplex mixing model, +runs GeomNMF, and plots the estimated source attribution matrix. +""" + import numpy as np import pytest -from geom_nmf import GeomNMF +from geom_nmf import GeomNMF, barplot_Phi, permute_to_reference + +SEED = 123 +K, J, N = 3, 8, 500 + +@pytest.fixture +def synthetic_data(): + rng = np.random.default_rng(SEED) + H_true = rng.dirichlet(np.ones(J), size=K) # (K, J) row-stochastic endmembers + W_true = rng.dirichlet(np.ones(K), size=N) # (N, K) row-stochastic weights + Y = W_true @ H_true # (N, J) observed data + return Y, H_true, W_true @pytest.fixture -def sample_data(): - rng = np.random.default_rng(42) - X = rng.random((100, 10)) - y = rng.random(100) - return X, y - - -class TestGeomNMFInit: - def test_default_params(self): - model = GeomNMF() - assert model.n_components == 10 - assert model.max_iter == 200 - assert model.tol == 1e-4 - assert model.random_state is None - - def test_custom_params(self): - model = GeomNMF(n_components=5, max_iter=100, tol=1e-3, random_state=0) - assert model.n_components == 5 - assert model.max_iter == 100 - assert model.tol == 1e-3 - assert model.random_state == 0 - - def test_get_params(self): - model = GeomNMF(n_components=5) - params = model.get_params() - assert params["n_components"] == 5 - - def test_set_params(self): - model = GeomNMF() - model.set_params(n_components=3) - assert model.n_components == 3 - - -class TestGeomNMFFit: - def test_fit_returns_self(self, sample_data): - X, y = sample_data - model = GeomNMF() - result = model.fit(X, y) - assert result is model - - def test_fit_sets_n_features_in(self, sample_data): - X, y = sample_data - model = GeomNMF().fit(X, y) - assert model.n_features_in_ == X.shape[1] - - def test_fit_sets_is_fitted(self, sample_data): - X, y = sample_data - model = GeomNMF().fit(X, y) - assert model.is_fitted_ - - def test_fit_invalid_input(self): - model = GeomNMF() - with pytest.raises(ValueError): - model.fit([[1, 2], [3, 4]], [1]) # mismatched samples - - -class TestGeomNMFPredict: - def test_predict_raises_before_fit(self, sample_data): - X, _ = sample_data - model = GeomNMF() - with pytest.raises(Exception): - model.predict(X) - - def test_predict_not_implemented(self, sample_data): - X, y = sample_data - model = GeomNMF().fit(X, y) - with pytest.raises(NotImplementedError): - model.predict(X) +def geom_nmf_result(synthetic_data): + Y, _, _ = synthetic_data + return GeomNMF(Y, K=K, seed=SEED, verbose=False) + + +def test_result_attributes(geom_nmf_result): + result = geom_nmf_result + assert hasattr(result, "H_star") + assert hasattr(result, "Phi") + assert hasattr(result, "mu_tilde") + assert hasattr(result, "logvol") + assert hasattr(result, "elapsed") + + +def test_intrinsic_rank(geom_nmf_result): + result = geom_nmf_result + assert result.fit_diagnostics["intrinsic_rank"] == K - 1 + + +def test_endmember_shape(geom_nmf_result): + result = geom_nmf_result + assert result.H_star.shape == (K, J) + + +def test_phi_shape(geom_nmf_result): + result = geom_nmf_result + assert result.Phi.shape == (K, J) + + +def test_alignment_error(synthetic_data, geom_nmf_result): + _, H_true, _ = synthetic_data + result = geom_nmf_result + H_perm, _, _, order = permute_to_reference( + H_true, result.H_star, result.mu_tilde, result.Phi + ) + assert len(order) == K + errors = np.linalg.norm(H_true - H_perm, axis=1) + assert np.all(errors < 0.5), f"Large endmember errors: {errors.round(4)}" + + +def test_barplot_runs(geom_nmf_result): + feature_labels = [f"F{j+1}" for j in range(J)] + barplot_Phi( + geom_nmf_result.Phi, + title="Estimated source attribution (GeomNMF)", + feature_labels=feature_labels, + ) diff --git a/tests/test_viz.py b/tests/test_viz.py deleted file mode 100644 index 16ccd72..0000000 --- a/tests/test_viz.py +++ /dev/null @@ -1,44 +0,0 @@ -import numpy as np -import pytest -from geom_nmf import GeomNMF -from geom_nmf import viz - - -@pytest.fixture -def fitted_model(): - rng = np.random.default_rng(42) - X = rng.random((100, 10)) - y = rng.random(100) - return GeomNMF().fit(X, y) - - -@pytest.fixture -def coords(): - rng = np.random.default_rng(42) - return rng.uniform(low=[-180, -90], high=[180, 90], size=(100, 2)) - - -class TestPlotComponents: - def test_not_implemented(self, fitted_model): - with pytest.raises(NotImplementedError): - viz.plot_components(fitted_model) - - -class TestPlotReconstructionError: - def test_not_implemented(self, fitted_model): - with pytest.raises(NotImplementedError): - viz.plot_reconstruction_error(fitted_model) - - -class TestPlotSpatialMap: - def test_not_implemented(self, fitted_model, coords): - with pytest.raises(NotImplementedError): - viz.plot_spatial_map(fitted_model, coords) - - -class TestPlotPredictedVsActual: - def test_not_implemented(self): - y_true = np.random.default_rng(42).random(100) - y_pred = np.random.default_rng(0).random(100) - with pytest.raises(NotImplementedError): - viz.plot_predicted_vs_actual(y_true, y_pred) From e17964151772c2f7022cde4071558c69cfdee87d Mon Sep 17 00:00:00 2001 From: Bora Jin Date: Fri, 20 Mar 2026 12:56:12 -0400 Subject: [PATCH 2/6] env activate instead of shell for virtual environment Added instructions for activating the poetry environment. --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index dc388ae..abc85c1 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,11 @@ To activate the virtual environment: ```sh poetry shell ``` +or + +```sh +poetry env activate +``` Your prompt will change to indicate you're inside the environment. To exit when you're done: From 907a474f37bfa24bcd1e90e77c1676f63beade12 Mon Sep 17 00:00:00 2001 From: jinbora0720 Date: Fri, 20 Mar 2026 14:57:12 -0400 Subject: [PATCH 3/6] redefine n_clusters as minimum --- src/geom_nmf/core.py | 5 +++-- src/geom_nmf/endmembers.py | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/geom_nmf/core.py b/src/geom_nmf/core.py index b42ee0e..9b0a1c4 100644 --- a/src/geom_nmf/core.py +++ b/src/geom_nmf/core.py @@ -81,7 +81,8 @@ def GeomNMF( * ``"exact"`` – convex-hull vertices via QHull (exact but may be slow in high intrinsic dimension). * ``"random"`` – random projection extremes (fast approximation; - controlled by *n_directions*, *n_top*, *n_candidates*). + controlled by *n_directions*, *n_top*, *n_candidates*). When n >> J, + this can be slower than ``"exact"``. Default ``"exact"``. n_directions : int, optional @@ -98,7 +99,7 @@ def GeomNMF( cluster before the exhaustive volume search. Useful when the candidate pool is large. Default False. n_clusters : int or None, optional - Number of clusters when *prune=True*. Defaults to ``10*K``. + Minimum number of clusters when *prune=True*. Defaults to ``10*K``. refine_greedy : bool, optional If True, run one pass of N-FINDR greedy swaps on the initial solution to try to improve the simplex volume. Only meaningful when diff --git a/src/geom_nmf/endmembers.py b/src/geom_nmf/endmembers.py index 8bee0d3..2650931 100644 --- a/src/geom_nmf/endmembers.py +++ b/src/geom_nmf/endmembers.py @@ -143,15 +143,15 @@ def prune_close_points( The number of clusters is chosen automatically by maximizing the silhouette score over the range - ``[max(2, n_clusters // 2), min(2 * n_clusters, m - 1)]``, - centered around *n_clusters*. + ``[max(2, n_clusters), min(2 * n_clusters, m - 1)]``, + with *n_clusters* as the lower bound. Parameters ---------- points : np.ndarray, shape (m, J) Points to prune. Must be finite (no NaN or Inf). n_clusters : int, optional - Target number of representatives (clusters) to retain after pruning. The + Minimum number of representatives (clusters) to retain after pruning. The silhouette search explores a range around this value, so the actual number returned may differ slightly. Default 25. seed : int, optional @@ -179,7 +179,7 @@ def prune_close_points( except ImportError: return X, list(range(m)) - lower = max(2, n_clusters // 2) + lower = max(2, n_clusters) upper = min(2 * n_clusters, m - 1) if lower > upper: From 367b6cec5e22f10624ecefa1817cbc90706fe609 Mon Sep 17 00:00:00 2001 From: jinbora0720 Date: Fri, 20 Mar 2026 16:31:36 -0400 Subject: [PATCH 4/6] tighten the search interval for n_clusters and add more explicit dosctring --- src/geom_nmf/core.py | 4 +++- src/geom_nmf/endmembers.py | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/geom_nmf/core.py b/src/geom_nmf/core.py index 9b0a1c4..a731022 100644 --- a/src/geom_nmf/core.py +++ b/src/geom_nmf/core.py @@ -99,7 +99,9 @@ def GeomNMF( cluster before the exhaustive volume search. Useful when the candidate pool is large. Default False. n_clusters : int or None, optional - Minimum number of clusters when *prune=True*. Defaults to ``10*K``. + Target minimum number of clusters when *prune=True*. The actual number + returned may be lower if empty clusters reduce the count. + Defaults to ``10*K``. refine_greedy : bool, optional If True, run one pass of N-FINDR greedy swaps on the initial solution to try to improve the simplex volume. Only meaningful when diff --git a/src/geom_nmf/endmembers.py b/src/geom_nmf/endmembers.py index 2650931..4decc54 100644 --- a/src/geom_nmf/endmembers.py +++ b/src/geom_nmf/endmembers.py @@ -143,7 +143,7 @@ def prune_close_points( The number of clusters is chosen automatically by maximizing the silhouette score over the range - ``[max(2, n_clusters), min(2 * n_clusters, m - 1)]``, + ``[max(2, n_clusters), min(int(1.5 * n_clusters), m - 1)]``, with *n_clusters* as the lower bound. Parameters @@ -151,9 +151,9 @@ def prune_close_points( points : np.ndarray, shape (m, J) Points to prune. Must be finite (no NaN or Inf). n_clusters : int, optional - Minimum number of representatives (clusters) to retain after pruning. The - silhouette search explores a range around this value, so the actual - number returned may differ slightly. Default 25. + Target minimum number of representatives to retain after pruning. + The silhouette search starts at this value, but the actual number + returned may be lower if empty clusters reduce the count. Default 25. seed : int, optional Random seed passed to MiniBatchKMeans. Default 123. @@ -180,7 +180,7 @@ def prune_close_points( return X, list(range(m)) lower = max(2, n_clusters) - upper = min(2 * n_clusters, m - 1) + upper = min(int(1.5 * n_clusters), m - 1) if lower > upper: return X, list(range(m)) From e06d349b90e7b1621d9aced36a0c0cb2361108df Mon Sep 17 00:00:00 2001 From: jinbora0720 Date: Wed, 25 Mar 2026 23:44:50 -0400 Subject: [PATCH 5/6] delete useless text --- src/geom_nmf/nfindr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/geom_nmf/nfindr.py b/src/geom_nmf/nfindr.py index 8ba7d79..dc07f45 100644 --- a/src/geom_nmf/nfindr.py +++ b/src/geom_nmf/nfindr.py @@ -7,7 +7,7 @@ Reference --------- Winter, M. E. (1999). N-FINDR: An algorithm for fast autonomous spectral -end-member determination in hyperspectral data. *Proc. SPIE 3753*. +end-member determination in hyperspectral data. """ import math From 54074f5ea0b6bba58f800664dbe293623b291f5a Mon Sep 17 00:00:00 2001 From: jinbora0720 Date: Wed, 25 Mar 2026 23:52:32 -0400 Subject: [PATCH 6/6] fixed ATGP reference --- src/geom_nmf/nfindr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/geom_nmf/nfindr.py b/src/geom_nmf/nfindr.py index dc07f45..9076a06 100644 --- a/src/geom_nmf/nfindr.py +++ b/src/geom_nmf/nfindr.py @@ -205,7 +205,7 @@ def nfindr( ----- Deviations from Winter (1999): - * Defaults to ATGP initialization (Plaza et al., 2002) instead of + * Defaults to ATGP initialization (Plaza and Chang, 2006) instead of random; pass ``init="random"`` to recover the original behavior. * Optional L2 row-normalization before PCA (``normalize=True`` by default); Winter (1999) does not normalize.