From 596afa4eddadc1db201eeccaa691d7e1e9d54463 Mon Sep 17 00:00:00 2001
From: jinbora0720 <jinbora0720@gmail.com>
Date: Fri, 20 Mar 2026 12:53:49 -0400
Subject: [PATCH 1/6] first publish

---
 src/geom_nmf/__init__.py   |  36 +++-
 src/geom_nmf/_base.py      |  65 --------
 src/geom_nmf/_viz.py       |  77 ---------
 src/geom_nmf/core.py       | 318 ++++++++++++++++++++++++++++++++++++
 src/geom_nmf/endmembers.py | 327 +++++++++++++++++++++++++++++++++++++
 src/geom_nmf/nfindr.py     | 319 ++++++++++++++++++++++++++++++++++++
 src/geom_nmf/plot.py       | 128 +++++++++++++++
 src/geom_nmf/utils.py      |  58 +++++++
 src/geom_nmf/weights.py    | 146 +++++++++++++++++
 tests/test_basics.py       |   5 -
 tests/test_model.py        | 135 ++++++++-------
 tests/test_viz.py          |  44 -----
 12 files changed, 1394 insertions(+), 264 deletions(-)
 delete mode 100644 src/geom_nmf/_base.py
 delete mode 100644 src/geom_nmf/_viz.py
 create mode 100644 src/geom_nmf/core.py
 create mode 100644 src/geom_nmf/endmembers.py
 create mode 100644 src/geom_nmf/nfindr.py
 create mode 100644 src/geom_nmf/plot.py
 create mode 100644 src/geom_nmf/utils.py
 create mode 100644 src/geom_nmf/weights.py
 delete mode 100644 tests/test_basics.py
 delete mode 100644 tests/test_viz.py

diff --git a/src/geom_nmf/__init__.py b/src/geom_nmf/__init__.py
index 7c16065..0fad239 100644
--- a/src/geom_nmf/__init__.py
+++ b/src/geom_nmf/__init__.py
@@ -1,7 +1,35 @@
-from importlib.metadata import version
+"""
+geom_nmf
+========
+Geometric NMF: geometric non-negative matrix factorization via maximum-volume simplex fitting.
+
+Public API
+----------
+GeomNMF                      Main pipeline.
+nfindr                       N-FINDR endmember extraction (standalone).
+log_simplex_volume           Log-volume of a simplex spanned by K points.
+compute_Phi                  Source attribution matrix from source means and H.
+estimate_weights             Recover W from Y and H.
+permute_to_reference         Permute estimated H, W, Phi to best match a reference ordering.
+barplot_Phi                  Grouped bar plot of Phi (% attribution per source and feature).
+"""
 
-from ._base import GeomNMF
-from . import _viz as viz
+from importlib.metadata import version
+from .core import GeomNMF, GeomNMFResult
+from .nfindr import nfindr
+from .endmembers import log_simplex_volume
+from .weights import compute_Phi, estimate_weights
+from .utils import permute_to_reference
+from .plot import barplot_Phi
 
 __version__ = version("geom-nmf")
-__all__ = ["GeomNMF", "viz"]
+__all__ = [
+    "GeomNMF",
+    "GeomNMFResult",
+    "nfindr",
+    "log_simplex_volume",
+    "compute_Phi",
+    "estimate_weights",
+    "permute_to_reference",
+    "barplot_Phi"
+]
diff --git a/src/geom_nmf/_base.py b/src/geom_nmf/_base.py
deleted file mode 100644
index 55bc15f..0000000
--- a/src/geom_nmf/_base.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import numpy as np
-from sklearn.base import BaseEstimator, RegressorMixin
-from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
-
-
-class GeomNMF(BaseEstimator, RegressorMixin):
-    """Geospatial Non-negative Matrix Factorization regressor.
-
-    Parameters
-    ----------
-    n_components : int, default=10
-        Number of latent components (NMF rank).
-    max_iter : int, default=200
-        Maximum number of iterations.
-    tol : float, default=1e-4
-        Convergence tolerance.
-    random_state : int or None, default=None
-        Random seed for reproducibility.
-    """
-
-    def __init__(self, n_components=10, max_iter=200, tol=1e-4, random_state=None):
-        self.n_components = n_components
-        self.max_iter = max_iter
-        self.tol = tol
-        self.random_state = random_state
-
-    def fit(self, X, y):
-        """Fit the model to training data.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples, n_features)
-            Training input samples.
-        y : array-like of shape (n_samples,)
-            Target values.
-
-        Returns
-        -------
-        self : GeomNMF
-            Fitted estimator.
-        """
-        X, y = check_X_y(X, y)
-
-        self.n_features_in_ = X.shape[1]
-        self.is_fitted_ = True
-
-        return self
-
-    def predict(self, X):
-        """Predict target values for input samples.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples, n_features)
-            Input samples.
-
-        Returns
-        -------
-        y_pred : ndarray of shape (n_samples,)
-            Predicted values.
-        """
-        check_is_fitted(self)
-        X = check_array(X)
-
-        raise NotImplementedError("predict() is not yet implemented.")
diff --git a/src/geom_nmf/_viz.py b/src/geom_nmf/_viz.py
deleted file mode 100644
index e9c6e08..0000000
--- a/src/geom_nmf/_viz.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import numpy as np
-
-
-def plot_components(model, feature_names=None, ax=None):
-    """Plot the learned NMF components (basis vectors).
-
-    Parameters
-    ----------
-    model : GeomNMF
-        A fitted GeomNMF instance.
-    feature_names : list of str, optional
-        Names for each feature/column.
-    ax : matplotlib.axes.Axes, optional
-        Axes to plot on. Creates a new figure if not provided.
-
-    Returns
-    -------
-    ax : matplotlib.axes.Axes
-    """
-    raise NotImplementedError
-
-
-def plot_reconstruction_error(model, ax=None):
-    """Plot reconstruction error over iterations.
-
-    Parameters
-    ----------
-    model : GeomNMF
-        A fitted GeomNMF instance.
-    ax : matplotlib.axes.Axes, optional
-        Axes to plot on. Creates a new figure if not provided.
-
-    Returns
-    -------
-    ax : matplotlib.axes.Axes
-    """
-    raise NotImplementedError
-
-
-def plot_spatial_map(model, coords, component=0, ax=None):
-    """Plot the spatial distribution of a single NMF component.
-
-    Parameters
-    ----------
-    model : GeomNMF
-        A fitted GeomNMF instance.
-    coords : array-like of shape (n_samples, 2)
-        Longitude/latitude coordinates for each sample.
-    component : int, default=0
-        Index of the component to plot.
-    ax : matplotlib.axes.Axes, optional
-        Axes to plot on. Creates a new figure if not provided.
-
-    Returns
-    -------
-    ax : matplotlib.axes.Axes
-    """
-    raise NotImplementedError
-
-
-def plot_predicted_vs_actual(y_true, y_pred, ax=None):
-    """Scatter plot of predicted vs. actual target values.
-
-    Parameters
-    ----------
-    y_true : array-like of shape (n_samples,)
-        True target values.
-    y_pred : array-like of shape (n_samples,)
-        Predicted target values.
-    ax : matplotlib.axes.Axes, optional
-        Axes to plot on. Creates a new figure if not provided.
-
-    Returns
-    -------
-    ax : matplotlib.axes.Axes
-    """
-    raise NotImplementedError
diff --git a/src/geom_nmf/core.py b/src/geom_nmf/core.py
new file mode 100644
index 0000000..b42ee0e
--- /dev/null
+++ b/src/geom_nmf/core.py
@@ -0,0 +1,318 @@
+"""
+geom_nmf.core
+=============
+Main GeomNMF pipeline: maximum-volume simplex fitting for geometric NMF.
+"""
+
+import time
+from typing import NamedTuple
+import numpy as np
+import pandas as pd
+
+from geom_nmf.endmembers import (
+    get_hull_candidates,
+    get_random_direction_candidates,
+    prune_close_points,
+    estimate_H, 
+    log_simplex_volume
+)
+from geom_nmf.weights import estimate_weights, compute_Phi
+from geom_nmf.nfindr import nfindr
+
+
+class GeomNMFResult(NamedTuple):
+    """Return type of :func:`GeomNMF`."""
+    H_star:             np.ndarray  # (K, J) estimated endmembers (row-normalized)
+    W_tilde:            np.ndarray  # (n, K) mixing weights scaled to original row sums
+    mu_tilde:           np.ndarray  # (K,)   mean weight per source
+    Phi:                np.ndarray  # (K, J) source attribution matrix
+    logvol:             float       # log-volume of the selected simplex
+    weight_diagnostics: dict        # diagnostics from weight recovery (see estimate_weights)
+    fit_diagnostics:    dict        # diagnostics from geometry / candidate steps
+    elapsed:            float       # total wall-clock run time in seconds
+
+
+def GeomNMF(
+    Y: np.ndarray | pd.DataFrame,
+    K: int,
+    seed: int = 123,
+    rank_tol: float = 1e-12,
+    weight_tol: float = 1e-2,
+    candidate_method: str = "exact",
+    n_directions: int = 20000,
+    n_top: int = 1,
+    n_candidates: int | None = None,
+    prune: bool = False,
+    n_clusters: int | None = None,
+    refine_greedy: bool = False,
+    verbose: bool = False,
+) -> GeomNMFResult:
+    """
+    Geometric non-negative matrix factorization via maximum-volume simplex fitting (GeomNMF).
+
+    Assumes the observed data matrix Y follows a simplex mixing model::
+
+        Y ≈ W H,  W ≥ 0,  H ≥ 0,  H 1 = 1
+
+    and estimates H (endmembers) by finding the maximum-volume K-simplex
+    inscribed in the convex hull of the row-normalized data Y_star.
+
+    Parameters
+    ----------
+    Y : np.ndarray or pd.DataFrame, shape (n, J)
+        Observed data matrix.  Rows are observations; columns are features.
+        All entries must be nonnegative (the method uses row-normalization).
+    K : int
+        Number of sources / endmembers to estimate.
+    seed : int, optional
+        Random seed (used for random candidate method and greedy refinement).
+        Default 123.
+    rank_tol : float, optional
+        Singular-value threshold for intrinsic rank determination; values below
+        this are treated as zero.  Default 1e-12.
+    weight_tol : float, optional
+        Threshold for flagging simplex constraint violations in
+        ``weight_diagnostics``; rows where any weight is below ``-weight_tol``
+        or the row sum deviates from 1 by more than ``weight_tol`` are flagged.
+        Default 1e-2.
+    candidate_method : {"exact", "random"}, optional
+        How to generate the pool of endmember candidates:
+
+        * ``"exact"`` – convex-hull vertices via QHull (exact but may be
+          slow in high intrinsic dimension).
+        * ``"random"`` – random projection extremes (fast approximation;
+          controlled by *n_directions*, *n_top*, *n_candidates*).
+
+        Default ``"exact"``.
+    n_directions : int, optional
+        Number of random directions for ``candidate_method="random"``.
+        Default 20000.
+    n_top : int, optional
+        Number of extreme points recorded per random direction (both ends)
+        when ``candidate_method="random"``.  Default 1.
+    n_candidates : int or None, optional
+        Number of candidates to keep after random direction sampling
+        (highest-frequency first).  Default None (keep all that appear).
+    prune : bool, optional
+        If True, cluster the candidates and keep one representative per
+        cluster before the exhaustive volume search.  Useful when the
+        candidate pool is large.  Default False.
+    n_clusters : int or None, optional
+        Number of clusters when *prune=True*.  Defaults to ``10*K``.
+    refine_greedy : bool, optional
+        If True, run one pass of N-FINDR greedy swaps on the initial
+        solution to try to improve the simplex volume.  Only meaningful when
+        ``candidate_method="random"`` or ``prune=True`` (with ``"exact"``
+        and no pruning the exhaustive search is already globally optimal over
+        the hull vertices).  Default False.
+    verbose : bool, optional
+        Print progress information.  Default False.
+
+    Raises
+    ------
+    ValueError
+        If *Y* contains any NaN or Inf values, any negative values, any
+        all-zero rows, or if *candidate_method* is not ``"exact"`` or
+        ``"random"``.
+
+    Returns
+    -------
+    result : GeomNMFResult
+        A named tuple with the following fields:
+
+        * ``H_star`` – np.ndarray, shape (K, J): estimated endmembers
+          (rows of row-normalized Y_star).
+        * ``W_tilde`` – np.ndarray, shape (n, K): estimated mixing weights
+          scaled back to the original row-sum magnitudes.
+        * ``mu_tilde`` – np.ndarray, shape (K,): mean weight per source.
+        * ``Phi`` – np.ndarray, shape (K, J): source attribution matrix
+          (see :func:`geom_nmf.weights.compute_Phi`).
+        * ``logvol`` – float: log-volume of the selected simplex.
+        * ``weight_diagnostics`` – dict: diagnostics from the weight recovery step
+          (see :func:`geom_nmf.weights.estimate_weights`), with keys:
+
+          - ``"max_row_sum_dev_H"`` – max absolute deviation of H row sums from 1;
+            should be near zero for a well-formed endmember matrix.
+          - ``"rank_H_aug"`` – rank of the augmented system ``[H | 1]``; should
+            equal K for an identifiable model.
+          - ``"cond_G"`` – condition number of ``H_aug @ H_aug.T``; large values
+            indicate near-collinear endmembers and amplified weight errors.
+          - ``"I_err"`` – ``‖H_aug pinv(H_aug) − I‖_∞``; near zero means the
+            pseudoinverse is numerically accurate.
+          - ``"aug_resid_inf"`` – ``‖Y_aug − W_raw H_aug‖_∞``; reconstruction
+            error of the raw (pre-clipping) weights.
+          - ``"large_neg_rows"`` – row indices of observations where the raw
+            weight vector has a component below ``−weight_tol``; many such rows
+            suggest model misspecification.
+          - ``"large_neg_count"`` – number of rows in ``large_neg_rows``.
+          - ``"not_sum1_rows"`` – row indices where the raw weight row-sum
+            deviates from 1 by more than ``weight_tol``.
+          - ``"not_sum1_count"`` – number of rows in ``not_sum1_rows``.
+        * ``fit_diagnostics`` – dict: diagnostics from the geometry / candidate
+          steps, with keys:
+
+          - ``"intrinsic_rank"`` – int: effective affine dimension of the data
+            (number of singular values above *rank_tol*); should equal K-1 for a
+            well-posed model.
+          - ``"n_cand"`` – int: number of candidate points passed to the
+            exhaustive max-volume search (after optional pruning).
+          - ``"cand_idx"`` – np.ndarray: indices (into the input *Y*) of the
+            candidate extreme points identified in Step 1.
+          - ``"direction_hit_counts"`` – np.ndarray or None: per-observation selection
+            frequency from random-direction sampling; None when
+            ``candidate_method="exact"``.
+          - ``"logvol_before_refinement"`` – float or None: log-volume of the exhaustive
+            search solution before greedy refinement; None when
+            ``refine_greedy=False`` or refinement was skipped.
+        * ``elapsed`` – float: total wall-clock run time in seconds.
+    """
+    t_start = time.time()
+
+    if isinstance(Y, pd.DataFrame):
+        Y = Y.to_numpy()
+    Y = np.asarray(Y, dtype=float)
+
+    # check bad data
+    if not np.isfinite(Y).all():
+        bad = np.argwhere(~np.isfinite(Y))
+        raise ValueError(
+            f"Y contains {len(bad)} non-finite value(s) (NaN or Inf). "
+            "Remove or impute incomplete rows before calling GeomNMF."
+        )
+    if (Y < 0).any():
+        bad = np.argwhere(Y < 0)
+        raise ValueError(
+            f"Y contains {len(bad)} negative value(s). "
+            "GeomNMF requires a nonnegative data matrix."
+        )
+    if (Y.sum(axis=1) == 0).any():
+        bad_rows = np.flatnonzero(Y.sum(axis=1) == 0)
+        raise ValueError(
+            f"Y has {len(bad_rows)} all-zero row(s) (indices: {bad_rows.tolist()}). "
+            "Row normalization requires each row to have a positive sum."
+        )
+
+    # Row sums and normalized data
+    n = Y.shape[0]
+    r = Y.sum(axis=1, keepdims=True)
+    Y_star = Y / r                           # (n, J)
+
+    # --- Step 0: project to intrinsic affine subspace ---
+    Y_star_reduced = Y_star[:, :-1].astype(float, copy=False)   # (n, J-1)
+    mean = Y_star_reduced.mean(axis=0, keepdims=True)
+    Yc = Y_star_reduced - mean
+    _, S_svd, Vt = np.linalg.svd(Yc, full_matrices=False)
+    mask = S_svd > rank_tol
+    basis = Vt[mask].T                       # (J-1, rank)
+    Yc_proj = Yc @ basis                     # (n, rank)
+
+    # --- Step 1: build candidate pool ---
+    counts = None  # only populated for candidate_method="random"
+    if candidate_method == "exact":
+        cand_idx = get_hull_candidates(Yc_proj, verbose=verbose)
+        cand_proj = Yc_proj[cand_idx]
+
+    elif candidate_method == "random":
+        # Whiten (scale to unit covariance -> remove directional stretch and make all directions comparable) 
+        # intrinsic coordinates before random projection
+        C_cov = (Yc_proj.T @ Yc_proj) / max(n - 1, 1)
+        evals, evecs = np.linalg.eigh(C_cov)
+        evals = np.maximum(evals, rank_tol)
+        inv_sqrt_cov = evecs @ np.diag(1.0 / np.sqrt(evals)) @ evecs.T
+        Yc_proj_w = Yc_proj @ inv_sqrt_cov
+        cand_idx, counts = get_random_direction_candidates(
+            Yc_proj_w, seed=seed, n_directions=n_directions, n_top=n_top, n_candidates=n_candidates, verbose=verbose,
+        )
+        cand_proj = Yc_proj[cand_idx]        # un-whitened: pruning clusters in the same geometry as the volume objective
+
+    else:
+        raise ValueError("candidate_method must be 'exact' or 'random'.")
+
+    cand_ambient = Y_star[cand_idx]          # (m, J)
+
+    # --- Optional: cluster-based pruning ---
+    if prune:
+        _n_clusters = 10 * K if n_clusters is None else n_clusters
+        if _n_clusters >= len(cand_proj):
+            if verbose:
+                print(
+                    f"Pruning skipped: candidate count {len(cand_proj)} ≤ n_clusters {_n_clusters}"
+                )
+        else:
+            _, pruned_idx_local = prune_close_points(cand_proj, n_clusters=_n_clusters, seed=seed)
+            cand_ambient = cand_ambient[pruned_idx_local]
+            cand_idx = cand_idx[pruned_idx_local]
+            if verbose:
+                print(f"Number of pruned candidates: {len(pruned_idx_local)}")
+
+    H_candidates = cand_ambient
+
+    # --- Step 2: exhaustive max-volume search over candidates ---
+    H_star_hat, logvol_hat = estimate_H(
+        H_candidates, K, verbose=verbose
+    )
+
+    # --- Optional: N-FINDR greedy refinement ---
+    logvol_before = None
+    if refine_greedy:
+        if candidate_method == "exact" and not prune:
+            if verbose:
+                print(
+                    "Greedy refinement skipped: exact hull without pruning is "
+                    "already globally optimal over hull vertices."
+                )
+        else:
+            logvol_before = logvol_hat
+            diffs = Y_star[:, np.newaxis, :] - H_star_hat[np.newaxis, :, :] # (n, K, J)
+            init_idx = np.argmin((diffs ** 2).sum(axis=2), axis=0) # (K,)
+
+            _, refined_idx = nfindr(
+                Y_star, K, normalize=False, init_idx=init_idx, seed=seed,
+            )
+            H_refined = Y_star[refined_idx]
+            logvol_refined, _ = log_simplex_volume(H_refined, rank_tol=rank_tol)
+
+            if logvol_refined > logvol_before:
+                H_star_hat = H_refined
+                logvol_hat = logvol_refined
+                if verbose:
+                    vol_ratio = np.exp(logvol_refined - logvol_before)
+                    print(
+                        f"Greedy refinement accepted; log-vol "
+                        f"{logvol_before:.4f} → {logvol_refined:.4f} "
+                        f"(volume ratio: {vol_ratio:.3f}x)"
+                    )
+            else:
+                if verbose:
+                    print(
+                        f"Greedy refinement rejected; refined log-vol "
+                        f"{logvol_refined:.4f} did not improve over "
+                        f"{logvol_before:.4f}"
+                    )
+
+    # --- Step 3: recover mixing weights ---
+    W_star_hat, weight_diagnostics = estimate_weights(Y_star, H_star_hat, weight_tol=weight_tol, verbose=verbose)
+    W_tilde_hat = W_star_hat * r
+    mu_tilde_hat = W_tilde_hat.mean(axis=0)
+    Phi_hat = compute_Phi(mu_tilde_hat, H_star_hat)
+
+    elapsed = time.time() - t_start
+
+    fit_diagnostics = {
+        "intrinsic_rank": int(mask.sum()),
+        "n_cand": len(H_candidates),
+        "cand_idx": cand_idx,
+        "direction_hit_counts": counts,                # np.ndarray if candidate_method="random", else None
+        "logvol_before_refinement": logvol_before,     # float if refine_greedy ran, else None
+    }
+
+    return GeomNMFResult(
+        H_star=H_star_hat,
+        W_tilde=W_tilde_hat,
+        mu_tilde=mu_tilde_hat,
+        Phi=Phi_hat,
+        logvol=logvol_hat,
+        weight_diagnostics=weight_diagnostics,
+        fit_diagnostics=fit_diagnostics,
+        elapsed=elapsed,
+    )
diff --git a/src/geom_nmf/endmembers.py b/src/geom_nmf/endmembers.py
new file mode 100644
index 0000000..8bee0d3
--- /dev/null
+++ b/src/geom_nmf/endmembers.py
@@ -0,0 +1,327 @@
+"""
+geom_nmf.endmembers
+===================
+Candidate endmember generation: convex-hull vertices (exact) or random
+projection extremes (approximate), plus clustering-based pruning.
+Volume scoring and exhaustive maximum-volume simplex search over a candidate set of endmember points.
+"""
+
+import time
+import numpy as np
+from itertools import combinations
+from scipy.special import gammaln, comb
+
+def get_hull_candidates(
+    Yc_proj: np.ndarray,
+    verbose: bool = False,
+) -> np.ndarray:
+    """
+    Return convex-hull vertex indices of *Yc_proj* via QHull.
+
+    Parameters
+    ----------
+    Yc_proj : np.ndarray, shape (n, r)
+        Centred, intrinsic-space projections of the data.
+    verbose : bool, optional
+        Print timing and candidate count.  Default False.
+
+    Returns
+    -------
+    cand_idx : np.ndarray, shape (m,)
+        Indices (into *Yc_proj*) of the convex-hull vertices.
+    """
+    from scipy.spatial import ConvexHull
+
+    if verbose:
+        print("Computing convex hull...", end="", flush=True)
+
+    start = time.time()
+    hull = ConvexHull(Yc_proj, qhull_options="Qx Qt Q12 Pp")
+
+    if verbose:
+        print(f" done in {time.time() - start:.2f}s; #cands={len(hull.vertices)}")
+
+    return hull.vertices
+
+
+def get_random_direction_candidates(
+    Yc_proj_w: np.ndarray,
+    seed: int = 123,
+    n_directions: int = 20000,
+    n_top: int = 1,
+    n_candidates: int | None = None,
+    verbose: bool = False,
+) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Find extreme points by projecting onto many random directions (vectorized).
+
+    For each random unit vector u the *n_top* largest **and** smallest dot
+    products ``Yc_proj_w @ u`` are recorded.  Points that appear as extremes
+    most often are the most likely simplex vertices.
+
+    Parameters
+    ----------
+    Yc_proj_w : np.ndarray, shape (n, r)
+        Whitened, centered projections of the data in intrinsic space.
+    seed : int, optional
+        Random seed for reproducibility.  Default 123.
+    n_directions : int, optional
+        Number of random directions to sample.  Default 20000.
+    n_top : int, optional
+        How many extreme points to record per direction (both ends).
+        Default 1.
+    n_candidates : int or None, optional
+        If given, return at most this many candidate indices (highest counts
+        first).  Default None (return all that were ever selected).
+    verbose : bool, optional
+        Print timing and candidate count.  Default False.
+
+    Returns
+    -------
+    chosen_sorted : np.ndarray, shape (m,)
+        Indices (into *Yc_proj_w*) of candidate extreme points, sorted by
+        descending selection frequency.
+    counts : np.ndarray, shape (n,)
+        Full frequency array for all n points.
+    """
+    X = np.asarray(Yc_proj_w, dtype=float)
+    n, r = X.shape
+    rng = np.random.default_rng(seed)
+    counts = np.zeros(n, dtype=np.intp)
+
+    if verbose:
+        print(
+            f"Sampling {n_top} extreme points in each of {n_directions} random directions...",
+            end="",
+            flush=True,
+        )
+    start = time.time()
+
+    batch = min(n_directions, 2048)
+    remaining = n_directions
+    while remaining > 0:
+        b = min(batch, remaining)
+        remaining -= b
+
+        U = rng.standard_normal((r, b))
+        U /= np.linalg.norm(U, axis=0, keepdims=True) + 1e-12
+
+        S = X @ U   # (n, b)
+
+        if n_top == 1:
+            np.add.at(counts, S.argmax(axis=0), 1)
+            np.add.at(counts, S.argmin(axis=0), 1)
+        else:
+            St = S.T  # (b, n)
+            hi_idx = np.argpartition(St, -n_top, axis=1)[:, -n_top:]
+            lo_idx = np.argpartition(St,  n_top, axis=1)[:,  :n_top]
+            np.add.at(counts, hi_idx.ravel(), 1)
+            np.add.at(counts, lo_idx.ravel(), 1)
+
+    chosen = np.flatnonzero(counts > 0)
+    chosen_sorted = chosen[np.argsort(counts[chosen])[::-1]] # min(n, 2n_directions x n_top) candidates, sorted by count
+
+    if n_candidates is not None:
+        chosen_sorted = chosen_sorted[:n_candidates] # min(n, 2n_directions x n_top, n_candidates) candidates
+
+    if verbose:
+        print(f" done in {time.time() - start:.2f}s; #cands={len(chosen_sorted)}")
+
+    return chosen_sorted, counts
+
+def prune_close_points(
+    points: np.ndarray,
+    n_clusters: int = 25,
+    seed: int = 123,
+) -> tuple[np.ndarray, list[int]]:
+    """
+    Cluster *points* and return one representative per cluster.
+ 
+    The representative is the cluster member closest (in Euclidean distance)
+    to the cluster centroid, so the returned points are always actual data
+    points (not interpolated centroids).
+ 
+    The number of clusters is chosen automatically by maximizing the
+    silhouette score over the range
+    ``[max(2, n_clusters // 2), min(2 * n_clusters, m - 1)]``,
+    centered around *n_clusters*.
+ 
+    Parameters
+    ----------
+    points : np.ndarray, shape (m, J)
+        Points to prune.  Must be finite (no NaN or Inf).
+    n_clusters : int, optional
+        Target number of representatives (clusters) to retain after pruning.  The
+        silhouette search explores a range around this value, so the actual
+        number returned may differ slightly.  Default 25.
+    seed : int, optional
+        Random seed passed to MiniBatchKMeans.  Default 123.
+ 
+    Returns
+    -------
+    representatives : np.ndarray, shape (n_clusters_out, J)
+        Selected representative points.
+    selected_indices : list of int
+        Row indices into *points* for each representative.
+    """
+    X = np.asarray(points, dtype=float)
+    m = X.shape[0]
+ 
+    if m == 0:
+        return X, []
+    if m == 1:
+        return X.copy(), [0]
+ 
+    # try to import sklearn lazily; if missing, skip pruning safely.
+    try:
+        from sklearn.cluster import MiniBatchKMeans
+        from sklearn.metrics import silhouette_score
+    except ImportError:
+        return X, list(range(m))
+ 
+    lower = max(2, n_clusters // 2)
+    upper = min(2 * n_clusters, m - 1)
+ 
+    if lower > upper:
+        return X, list(range(m))
+ 
+    best_k, best_s = lower, -np.inf
+    for k in range(lower, upper + 1):
+        km = MiniBatchKMeans(n_clusters=k, random_state=seed, n_init=10, batch_size=4096)
+        labels = km.fit_predict(X)
+        if len(np.unique(labels)) < 2:
+            continue
+        s = silhouette_score(
+            X, labels,
+            metric="euclidean",
+            sample_size=min(10000, m),
+            random_state=seed,
+        )
+        if s > best_s:
+            best_s, best_k = s, k
+ 
+    km = MiniBatchKMeans(n_clusters=best_k, random_state=seed, n_init=10, batch_size=4096)
+    labels = km.fit_predict(X)
+    centers = km.cluster_centers_
+ 
+    selected = []
+    for k in range(best_k):
+        members = np.flatnonzero(labels == k)
+        if len(members) == 0:
+            continue
+        diffs = X[members] - centers[k]
+        nearest = members[np.einsum("ij,ij->i", diffs, diffs).argmin()]
+        selected.append(int(nearest))
+ 
+    selected = sorted(set(selected))
+    return X[selected], selected
+
+
+def log_simplex_volume(
+    subset: np.ndarray,
+    rank_tol: float = 1e-12,
+) -> tuple[float, int]:
+    """
+    Compute the log-volume of the (K-1)-simplex spanned by *subset*.
+
+    The volume is defined as::
+
+        vol = (∏ sᵢ) / r!
+
+    where sᵢ are the nonzero singular values of the edge matrix
+    ``subset[1:] - subset[0]`` and r is the affine rank.
+
+    Parameters
+    ----------
+    subset : np.ndarray, shape (K, J)
+        K points in J-dimensional ambient space.
+    rank_tol : float, optional
+        Singular-value threshold below which a dimension is considered zero.
+        Default 1e-12.
+
+    Returns
+    -------
+    log_vol : float
+        Natural log of the simplex volume, or ``-np.inf`` if degenerate.
+    r : int
+        Affine rank of *subset* (number of non-negligible singular values).
+    """
+    base = subset[0]
+    A = subset[1:] - base              # edge matrix, shape (K-1, J)
+    _, S, _ = np.linalg.svd(A, full_matrices=False)
+
+    mask = S > rank_tol
+    r = int(mask.sum())
+
+    if r < A.shape[0]:                 # degenerate simplex
+        return -np.inf, r
+
+    log_vol = float(np.log(S[mask]).sum() - gammaln(r + 1))
+    return log_vol, r
+
+
+def estimate_H(
+    hull_pts: np.ndarray,
+    K: int,
+    verbose: bool = False,
+) -> tuple[np.ndarray, float]:
+    """
+    Find the K-point subset of *hull_pts* that maximizes simplex log-volume.
+
+    Exhaustively iterates over all C(m, K) combinations.  Feasible when
+    the number of hull / candidate points is small (≲ a few hundred).
+
+    Parameters
+    ----------
+    hull_pts : np.ndarray, shape (m, J)
+        Candidate points (convex-hull vertices or random-direction extremes)
+        in the ambient endmember space.
+    K : int
+        Number of endmembers (simplex vertices) to select.
+    verbose : bool, optional
+        If True, prints the number of combinations and shows a tqdm progress
+        bar (requires tqdm to be installed).  Default False.
+
+    Returns
+    -------
+    H_hat_best : np.ndarray, shape (K, J)
+        The K rows of *hull_pts* that achieve the maximum log-volume.
+    best_logvol : float
+        Corresponding log-volume value.
+
+    Raises
+    ------
+    RuntimeError
+        If every K-subset of *hull_pts* forms a degenerate simplex (zero
+        volume), e.g. when all candidates are collinear.
+    """
+    m = int(hull_pts.shape[0])
+    total = int(comb(m, K, exact=True))
+    if verbose:
+        print(f"H candidates={m}, K={K}, combinations={total:,}")
+
+    it = combinations(range(m), K)
+    if verbose:
+        try:
+            from tqdm import tqdm as _tqdm
+            it = _tqdm(it, total=total, desc=f"Searching K={K} subsets for max volume")
+        except ImportError:
+            pass
+
+    best_logvol = -np.inf
+    best_inds: tuple | None = None
+
+    for inds in it:
+        logvol, _ = log_simplex_volume(hull_pts[list(inds)])
+        if logvol > best_logvol:
+            best_logvol = logvol
+            best_inds = inds
+
+    if best_inds is None:
+        raise RuntimeError(
+            f"No non-degenerate K={K}-simplex found among the {m} candidate points. "
+            "The candidate set may be collinear or too small."
+        )
+
+    H_hat_best = hull_pts[list(best_inds)]
+    return H_hat_best, float(best_logvol)
diff --git a/src/geom_nmf/nfindr.py b/src/geom_nmf/nfindr.py
new file mode 100644
index 0000000..8ba7d79
--- /dev/null
+++ b/src/geom_nmf/nfindr.py
@@ -0,0 +1,319 @@
+"""
+geom_nmf.nfindr
+===============
+N-FINDR endmember extraction (NumPy-only), based on Winter (1999) with
+documented modifications.
+
+Reference
+---------
+Winter, M. E. (1999). N-FINDR: An algorithm for fast autonomous spectral
+end-member determination in hyperspectral data.  *Proc. SPIE 3753*.
+"""
+
+import math
+import numpy as np
+import pandas as pd
+
+
+# ---------------------------------------------------------------------------
+# Private helpers
+# ---------------------------------------------------------------------------
+
+def _pca_reduce(
+    Y: np.ndarray,
+    d: int,
+) -> np.ndarray:
+    """
+    Center *Y* and project onto its top *d* principal components.
+
+    Parameters
+    ----------
+    Y : np.ndarray, shape (n, J)
+        Input data matrix.
+    d : int
+        Number of principal components to retain.
+
+    Returns
+    -------
+    Z : np.ndarray, shape (n, d)
+        Projected coordinates.
+    """
+    Yc = Y - Y.mean(axis=0, keepdims=True)
+    U, S, _ = np.linalg.svd(Yc, full_matrices=False)
+    return U[:, :d] * S[:d]
+
+def _build_simplex_matrix(
+    Z: np.ndarray,
+    idx: np.ndarray,
+    K: int,
+) -> tuple[np.ndarray, float]:
+    """
+    Build the augmented simplex matrix M and compute its volume.
+
+    Parameters
+    ----------
+    Z : np.ndarray, shape (n, K-1)
+        Data in the reduced PCA space.
+    idx : np.ndarray, shape (K,)
+        Indices of the K simplex vertices into Z.
+    K : int
+        Number of vertices.
+
+    Returns
+    -------
+    M : np.ndarray, shape (K, K)
+        Augmented matrix [[Z[idx].T], [1...1]].
+    vol : float
+        Simplex volume |det(M)| / (K-1)!
+    """
+    M = np.vstack([Z[idx].T, np.ones((1, K))])
+    vol = abs(float(np.linalg.det(M))) / math.factorial(K - 1)
+    return M, vol
+
+def _init_indices_random(
+    n: int,
+    K: int,
+    rng: np.random.Generator,
+) -> np.ndarray:
+    """
+    Draw *K* distinct indices uniformly at random.
+
+    Parameters
+    ----------
+    n : int
+        Total number of samples.
+    K : int
+        Number of indices to draw.
+    rng : np.random.Generator
+        NumPy random generator instance.
+
+    Returns
+    -------
+    idx : np.ndarray, shape (K,), dtype int
+        Selected indices (no duplicates).
+    """
+    return rng.choice(n, size=K, replace=False)
+
+
+def _init_indices_atgp(
+    Z: np.ndarray,
+    K: int,
+) -> np.ndarray:
+    """
+    ATGP-style greedy initialization in the reduced PCA space.
+
+    Iteratively adds the point whose residual (after projecting out the
+    subspace spanned by all current selections) has the largest norm.
+
+    Parameters
+    ----------
+    Z : np.ndarray, shape (n, d)
+        Data in the reduced (K-1)-dimensional PCA space.
+    K : int
+        Number of endmembers to initialise.
+
+    Returns
+    -------
+    idx : np.ndarray, shape (K,), dtype int
+        Indices of the K initialization points.
+    """
+    idx: list[int] = []
+    norms = np.einsum("ij,ij->i", Z, Z)
+    idx.append(int(np.argmax(norms)))
+
+    for _ in range(1, K):
+        # Q: (d, m=len(idx)) orthonormal basis for the row space of Z[idx]
+        Q, _ = np.linalg.qr(Z[idx].T)
+        # project each row of Z onto the subspace spanned by current selections
+        proj = Z @ (Q @ Q.T)
+        resid_sq = np.einsum("ij,ij->i", Z - proj, Z - proj)
+        resid_sq[idx] = -np.inf
+        # add the point with the largest orthogonal residual (most unexplained)
+        idx.append(int(np.argmax(resid_sq)))
+
+    return np.array(idx, dtype=int)
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+def nfindr(
+    Y: np.ndarray | pd.DataFrame,
+    K: int,
+    max_iter: int = 5,
+    seed: int | None = None,
+    normalize: bool = True,
+    init: str = "atgp",
+    init_idx: np.ndarray | None = None,
+) -> tuple[np.ndarray, np.ndarray]:
+    """
+    N-FINDR endmember extraction (NumPy-only, no SciPy).
+
+    Finds K endmembers by iteratively replacing each current endmember
+    candidate with the pixel that maximizes the simplex volume in the
+    (K-1)-dimensional PCA-reduced space.
+
+    Parameters
+    ----------
+    Y : np.ndarray or pd.DataFrame, shape (n, J) or (rows, cols, J)
+        Input spectral data.  Can be a 2-D matrix or a 3-D hyperspectral
+        cube; the cube is reshaped to (rows*cols, J) internally.
+        All entries should be nonnegative (typical for spectral data).
+    K : int
+        Number of endmembers to extract.  Must satisfy ``2 ≤ K ≤ J + 1``.
+    max_iter : int, optional
+        Maximum number of full replacement sweeps (each sweep considers
+        swapping every one of the K current endmembers with every pixel).
+        Iteration stops early if no improvement is found.  Default 5.
+    seed : int or None, optional
+        Random seed for reproducibility; used only when ``init="random"``.
+        Default None.
+    normalize : bool, optional
+        If True, L2-normalize each spectrum (row) before PCA projection.
+        Default True.
+    init : {"atgp", "random"}, optional
+        Initialisation strategy when *init_idx* is not provided:
+
+        * ``"atgp"`` – greedy ATGP initialization.
+          Recommended; differs from Winter (1999)'s original random init.
+        * ``"random"`` – K indices drawn uniformly at random (original
+          Winter (1999) behavior).
+
+        Default ``"atgp"``.
+    init_idx : array-like of int, shape (K,) or None, optional
+        If supplied, use these K indices as the starting endmember set and
+        ignore *init*.  All indices must be distinct and in ``[0, n)``.
+        Default None.
+
+    Returns
+    -------
+    E : np.ndarray, shape (K, J)
+        Extracted endmember spectra in the original feature space (rows of
+        the **unnormalized** *Y*).
+    idx : np.ndarray, shape (K,), dtype int
+        Indices of the selected pixels in the flattened data array.
+
+    Raises
+    ------
+    ValueError
+        If *Y* is not 2-D or 3-D, if K is out of range, if n < K, or if
+        *init_idx* is invalid (wrong shape, duplicates, or out-of-bounds
+        indices).
+
+    Notes
+    -----
+    Deviations from Winter (1999):
+
+    * Defaults to ATGP initialization (Plaza et al., 2002) instead of
+      random; pass ``init="random"`` to recover the original behavior.
+    * Optional L2 row-normalization before PCA (``normalize=True`` by
+      default); Winter (1999) does not normalize.
+    * The inner replacement loop is vectorized via the matrix-determinant
+      lemma, scoring all n candidate pixels at once per vertex.  This is
+      mathematically equivalent to the original but O(N·K²) per vertex
+      instead of O(N·K³).  A fallback to explicit volume recomputation
+      is used when the current simplex is numerically degenerate.
+
+    Volume formula (for reference)::
+
+        vol = |det(M)| / (K-1)!,   M = [[Z.T], [1...1]]  ∈ R^{K×K}
+    """
+
+    # process data to a working matrix
+    if isinstance(Y, pd.DataFrame):
+        Y = Y.to_numpy()
+
+    if Y.ndim == 3:
+        rows, cols, J = Y.shape
+        Y2 = Y.reshape(rows * cols, J)
+    elif Y.ndim == 2:
+        Y2 = Y
+    else:
+        raise ValueError("Y must be 2-D (n, J) or 3-D (rows, cols, bands).")
+
+    n, J = Y2.shape
+
+    if normalize:
+        norms = np.linalg.norm(Y2, axis=1, keepdims=True)
+        norms[norms == 0.0] = 1.0
+        Yn = Y2 / norms
+    else:
+        Yn = Y2
+
+    if K < 2:
+        raise ValueError(f"K must be at least 2 (a single endmember has zero simplex volume); got K={K}.")
+    if K > J + 1:
+        raise ValueError(f"K must be ≤ J+1={J + 1} (PCA reduces to K-1={K - 1} dims but data has only J={J} features); got K={K}.")
+    if n < K:
+        raise ValueError(f"Need at least K={K} observations to form a simplex; got n={n}.")
+
+    Z = _pca_reduce(Yn, d=K - 1)    # (n, K-1)
+
+    # initialize K indices
+    rng = np.random.default_rng(seed)
+    if init_idx is not None:
+        idx = np.asarray(init_idx, dtype=int)
+        if idx.shape != (K,):
+            raise ValueError(
+                f"init_idx must have exactly K={K} entries; got shape {idx.shape}."
+            )
+        if len(set(idx.tolist())) != K:
+            raise ValueError("init_idx must not contain duplicate indices.")
+        if idx.min() < 0 or idx.max() >= n:
+            raise ValueError(f"init_idx entries must lie in [0, n={n}).")
+        idx = idx.copy()
+    elif init == "atgp":
+        idx = _init_indices_atgp(Z, K)
+    elif init == "random":
+        idx = _init_indices_random(n, K, rng)
+    else:
+        raise ValueError("init must be 'atgp' or 'random'.")
+
+    M, best_vol = _build_simplex_matrix(Z, idx, K)
+
+    for _ in range(max_iter):
+        improved = False
+        for j in range(K):
+            locked = set(idx.tolist()) - {idx[j]}
+            det_M = np.linalg.det(M)
+            if abs(det_M) < 1e-300:
+                # Degenerate — fall back to explicit volume recomputation
+                best_j = idx[j]
+                trial = idx.copy()
+                for cand in range(n):
+                    if cand in locked:
+                        continue
+                    trial[j] = cand
+                    _, v = _build_simplex_matrix(Z, trial, K)
+                    if v > best_vol:
+                        best_vol = v
+                        best_j = cand
+                if best_j != idx[j]:
+                    idx[j] = best_j
+                    M, best_vol = _build_simplex_matrix(Z, idx, K) # safer to recompute
+                    improved = True
+                continue
+
+            # Vectorized scoring via matrix-determinant lemma:
+            #   det(M_new) = det(M) + (c_n - M[:,j]) · adj(M)[:,j]
+            # where adj(M)[:,j] = det(M) * (M^{-1})[j,:]
+            M_inv = np.linalg.inv(M)
+            adj_col_j = det_M * M_inv[j, :]           # (K,)
+            M_j_dot = float(M[:, j] @ adj_col_j)
+            scores = Z @ adj_col_j[:-1] + adj_col_j[-1]   # (n,)
+            new_dets = det_M + (scores - M_j_dot)
+            new_vols = np.abs(new_dets) / math.factorial(K - 1)
+
+            new_vols[np.array(list(locked), dtype=int)] = -np.inf
+
+            best_j = int(np.argmax(new_vols))
+            if new_vols[best_j] > best_vol:
+                idx[j] = best_j
+                M, best_vol = _build_simplex_matrix(Z, idx, K) # safer to recompute
+                improved = True
+
+        if not improved:
+            break
+
+    return Y2[idx, :], idx
diff --git a/src/geom_nmf/plot.py b/src/geom_nmf/plot.py
new file mode 100644
index 0000000..7153341
--- /dev/null
+++ b/src/geom_nmf/plot.py
@@ -0,0 +1,128 @@
+"""
+geom_nmf.plot
+=============
+Plotting helpers for GeomNMF results.
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+def barplot_Phi(
+    Phi: np.ndarray,
+    title: str = "Estimated source attribution (GeomNMF)",
+    feature_labels: list[str] | None = None,
+    feature_colors = None,
+    show_err: bool = True,
+    savepath: str | None = None, 
+) -> None:
+    """
+    Grouped bar plot of the source attribution matrix Phi.
+
+    Each group of bars corresponds to one source (k); within each group
+    there is one bar per feature (j), with height ``Phi[k, j] * 100`` (%).
+    When a bootstrap stack is provided, bars show the mean across replicates
+    and optional error bars show ±1 standard deviation.
+
+    Parameters
+    ----------
+    Phi : array-like
+        Either:
+
+        * shape ``(K, J)``: single Phi matrix.
+        * shape ``(n_reps, K, J)``: bootstrap stack; mean and std are
+          computed across the first axis.
+
+        Values are assumed to be in [0, 1] and are multiplied by 100 for
+        display.
+    title : str, optional
+        Figure title.  Default ``"Estimated source attribution (GeomNMF)"``.
+    feature_labels : list[str] or None, optional
+        Feature labels of length J used in the legend.
+        If None, defaults to ``["Feature 1", ..., "Feature J"]``.
+    feature_colors : None, list, or dict, optional
+        Bar colors per feature.  None uses matplotlib defaults; a list of
+        length J assigns colors by position; a dict maps feature label to
+        color.  Default None.
+    show_err : bool, optional
+        If True and *Phi* is 3D, draw error bars (±1 std across bootstrap
+        reps).  Default True.
+    savepath : str or None, optional
+        File path to save the figure (200 dpi, tight bbox).  Default None.
+    
+
+    Returns
+    -------
+    None
+        Displays the figure (and saves it if *savepath* is given).
+
+    Raises
+    ------
+    ValueError
+        If *Phi* is not 2D or 3D, or if *feature_labels* has the wrong
+        length.
+    """
+    A = np.asarray(Phi)
+    if A.ndim == 2:
+        K, J = A.shape                                          # (K, J)
+        Phi_mean = A * 100                                      # in %
+        Phi_std = None
+    elif A.ndim == 3:
+        n_reps, K, J = A.shape                                  # (n_reps, K, J)
+        Phi_mean = np.nanmean(A, axis=0) * 100                  # (K, J), in %
+        Phi_std  = np.nanstd(A, axis=0, ddof=1 if n_reps > 1 else 0) * 100 if show_err else None
+    else:
+        raise ValueError(f"Expected 2D (K, J) or 3D (n_reps, K, J); got shape {A.shape}")
+
+    # feature labels
+    if feature_labels is None:
+        labels = [f"Feature {j+1}" for j in range(J)]
+    else:
+        if len(feature_labels) != J:
+            raise ValueError(f"feature_labels must have length {J} (got {len(feature_labels)})")
+        labels = list(feature_labels)
+
+    x = np.arange(K)                    # one tick per source
+    width = 0.8 / max(J, 1)             # bar width within each source group
+
+    fig, ax = plt.subplots(figsize=(10, 5))
+
+    for j in range(J):
+        # build error bar kwargs only if we have usable stds
+        kwargs = {}
+        if Phi_std is not None:
+            yerr = Phi_std[:, j]        # (K,) std for feature j across sources
+            if np.any(np.isfinite(yerr)):
+                kwargs["yerr"] = yerr
+                kwargs["error_kw"] = {"elinewidth": 1.2, "capsize": 3}
+
+        # choose color for this feature
+        if feature_colors is None:
+            c = None
+        elif isinstance(feature_colors, dict):
+            c = feature_colors.get(labels[j], None)
+        else:
+            c = feature_colors[j]              
+
+        ax.bar(
+            x + j * width,
+            Phi_mean[:, j],            # (K,) attribution of feature j per source
+            width,
+            color=c,
+            label=labels[j],
+            **kwargs,
+        )
+
+    ax.set_xticks(x + (J - 1) * width / 2)
+    ax.set_xticklabels([f"Source {k+1}" for k in range(K)])
+    ax.set_ylabel("Expected % of pollutants\nattributable to each source")
+    ax.set_title(title)
+    ax.set_ylim(0, 100)
+
+    # Legend outside on the right
+    ax.legend(title="", loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
+    plt.tight_layout()
+
+    if savepath:
+        fig.savefig(savepath, dpi=200, bbox_inches="tight")
+    plt.show()
diff --git a/src/geom_nmf/utils.py b/src/geom_nmf/utils.py
new file mode 100644
index 0000000..76999d0
--- /dev/null
+++ b/src/geom_nmf/utils.py
@@ -0,0 +1,58 @@
+"""
+geom_nmf.utils
+==============
+Algorithmic utilities: endmember permutation matching 
+"""
+
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+def permute_to_reference(
+    H_ref: np.ndarray,
+    H: np.ndarray,
+    mu: np.ndarray,
+    Phi: np.ndarray,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Permute estimated endmembers to best match a reference ordering.
+
+    Uses the Hungarian algorithm to find the permutation of rows of *H*
+    that minimizes total L2 distance to *H_ref*, then applies the same
+    permutation to *mu* and *Phi*.
+
+    Parameters
+    ----------
+    H_ref : np.ndarray, shape (K, J)
+        Reference endmember matrix; rows are sources.
+    H : np.ndarray, shape (K, J)
+        Estimated endmember matrix whose rows are to be permuted.
+    mu : np.ndarray, shape (K,)
+        Estimated mean weight per source; permuted alongside H.
+    Phi : np.ndarray, shape (K, J)
+        Source attribution matrix (rows are sources, columns are features);
+        permuted alongside H.  
+
+    Returns
+    -------
+    H_perm : np.ndarray, shape (K, J)
+        Permuted endmember matrix aligned to H_ref.
+    mu_perm : np.ndarray, shape (K,)
+        Permuted mean weights.
+    Phi_perm : np.ndarray, shape (K, J)
+        Permuted source attribution matrix.
+    order : np.ndarray, shape (K,), dtype int
+        Permutation indices such that ``H_perm = H[order]``.
+    """
+    H_ref = np.asarray(H_ref)
+    H     = np.asarray(H)
+    mu    = np.asarray(mu)
+    Phi   = np.asarray(Phi)
+
+    # pairwise L2 cost: cost[i, j] = ||H_ref[i] - H[j]||
+    cost = np.linalg.norm(H_ref[:, None, :] - H[None, :, :], axis=2)  # (K_ref, K)
+    row_ind, col_ind = linear_sum_assignment(cost)
+
+    # reorder so that estimated rows align with H_ref row order
+    order = col_ind[np.argsort(row_ind)]
+
+    return H[order], mu[order], Phi[order], order
\ No newline at end of file
diff --git a/src/geom_nmf/weights.py b/src/geom_nmf/weights.py
new file mode 100644
index 0000000..d812c7a
--- /dev/null
+++ b/src/geom_nmf/weights.py
@@ -0,0 +1,146 @@
+"""
+geom_nmf.weights
+================
+Intensity (mixing weight) recovery and source attribution matrix computation.
+"""
+
+import warnings
+import numpy as np
+
+
+def compute_Phi(
+    mu: np.ndarray,
+    H: np.ndarray,
+) -> np.ndarray:
+    """
+    Compute the source attribution matrix Phi from source means and H.
+
+    Each entry ``Phi[k, j]`` is the fraction of feature j attributable to
+    source k, weighted by the source mean abundance::
+
+        Phi[k, j] = mu[k] * H[k, j] / sum_k'(mu[k'] * H[k', j])
+
+    Parameters
+    ----------
+    mu : np.ndarray, shape (K,)
+        Mean intensity of each of the K sources.
+    H : np.ndarray, shape (K, J)
+        Endmember matrix; rows are source profiles, columns are features.
+
+    Returns
+    -------
+    Phi : np.ndarray, shape (K, J)
+        Source composition matrix; columns sum to 1 over sources.
+
+    Notes
+    -----
+    If all sources contribute zero to a feature (column of ``mu[:, None] * H``
+    sums to zero), the corresponding column of *Phi* will be NaN.
+    """
+    numerator = mu[:, None] * H          # (K, J)
+    denominator = numerator.sum(axis=0)  # (J,)
+
+    Phi = numerator / denominator        # (K, J)
+    return Phi
+
+
+def estimate_weights(
+    Y: np.ndarray,
+    H: np.ndarray,
+    weight_tol: float = 1e-2,
+    verbose: bool = False,
+) -> tuple[np.ndarray, dict]:
+    """
+    Solve weights (intensity) W such that Y ≈ W H under the affine sum-to-one
+    constraint, via the right pseudoinverse of the augmented system [H | 1].
+
+    After computing ``W_raw = Y_aug @ pinv(H*_aug)`` the rows are projected
+    onto the probability simplex by clipping negatives and renormalizing
+    because rows of Y and rows of H are assumed to sum to 1.  
+
+    Parameters
+    ----------
+    Y : np.ndarray, shape (n, J)
+        Row-normalized observed data matrix; rows are observations, columns are features.
+    H : np.ndarray, shape (K, J) 
+        Row-stochastic endmember matrix; rows are source profiles.
+    weight_tol : float, optional
+        Threshold for flagging rows with large simplex violations (negative
+        weights or sum-deviation above this value).  Default 1e-2.
+    verbose : bool, optional
+        Print diagnostic statistics.  Default False.
+
+    Returns
+    -------
+    W : np.ndarray, shape (n, K)
+        Mixing weights; nonnegative, rows sum to 1.
+    diag : dict
+        Diagnostic information:
+
+        * ``"max_row_sum_dev_H"`` – max deviation of H row sums from 1
+        * ``"rank_H_aug"`` – rank of augmented [H | 1]
+        * ``"cond_G"`` – condition number of H_aug @ H_aug.T; large values
+          mean small errors in H_aug are amplified into large errors in
+          H_aug_R and therefore in W_raw
+        * ``"I_err"`` – ``||H_aug pinv(H_aug) - I||_inf``
+        * ``"aug_resid_inf"`` – ``||Y_aug - W_raw H_aug||_inf``
+        * ``"large_neg_rows"`` – row indices where min W_raw < -weight_tol
+        * ``"large_neg_count"`` – number of such rows
+        * ``"not_sum1_rows"`` – row indices where row sum of W_raw deviates from 1
+          by more than weight_tol
+        * ``"not_sum1_count"`` – number of such rows
+
+    """
+    H_in = np.asarray(H, dtype=float)
+    K, J = H_in.shape
+    n = Y.shape[0]
+
+    # augment to encode sum-to-one constraint
+    H_aug = np.hstack([H_in, np.ones((K, 1))])   # (K, J+1)
+    Y_aug = np.hstack([Y,    np.ones((n, 1))])   # (n, J+1)
+
+    # right inverse via SVD pseudoinverse
+    H_aug_R = np.linalg.pinv(H_aug)              # (J+1, K)
+
+    # raw intensity 
+    W_raw = Y_aug @ H_aug_R                       # (n, K)
+
+    # diagnostics
+    neg_mask  = W_raw.min(axis=1) < -weight_tol                   # large negative weights
+    sum1_mask = np.abs(W_raw.sum(axis=1) - 1.0) > weight_tol      # row sum deviates from 1
+
+    G = H_aug @ H_aug.T 
+    try:
+        condG = float(np.linalg.cond(G)) # large cond_G => small errors in H_aug amplified into large erros in H_aug_R
+    except np.linalg.LinAlgError:
+        condG = float("inf")
+    I_err = float(np.linalg.norm(H_aug @ H_aug_R - np.eye(K), ord=np.inf))
+    aug_resid = float(np.linalg.norm(Y_aug - W_raw @ H_aug, ord=np.inf))
+
+    if verbose:
+        print(f"||H_aug H_aug_R - I||_inf: {I_err:.3e}")
+
+    diag = {
+        "max_row_sum_dev_H": float(np.max(np.abs(H_in.sum(axis=1) - 1.0))),
+        "rank_H_aug": int(np.linalg.matrix_rank(H_aug)),
+        "cond_G": condG,
+        "I_err": I_err,
+        "aug_resid_inf": aug_resid,
+        "large_neg_rows":    np.flatnonzero(neg_mask),
+        "large_neg_count":   int(neg_mask.sum()),
+        "not_sum1_rows":     np.flatnonzero(sum1_mask),
+        "not_sum1_count":    int(sum1_mask.sum()),
+    }
+
+    # clip negatives and renormalize.
+    # note: if Y and H are both row stochastic and the model is well-specified,
+    # W_raw is already row stochastic and this is a no-oeration beyond numerical noise.
+    # under misspecification, clipping introduces bias — use diag to assess severity.
+    W = np.maximum(W_raw, 0.0)
+    s = W.sum(axis=1, keepdims=True)
+    # s == 0 only if every weight in a row is negative — degenerate, shouldn't
+    # occur when Y and H are row stochastic; guard defensively
+    s[s == 0.0] = 1.0
+    W /= s
+
+    return W, diag
diff --git a/tests/test_basics.py b/tests/test_basics.py
deleted file mode 100644
index 91b70b5..0000000
--- a/tests/test_basics.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from geom_nmf import __version__
-
-
-def test_version():
-    assert __version__ == '0.1.0a0'
diff --git a/tests/test_model.py b/tests/test_model.py
index 71d84fe..fc7b28a 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -1,74 +1,71 @@
+"""
+Basic smoke test for GeomNMF and barplot_Phi.
+
+Generates synthetic data from a known K=3 simplex mixing model,
+runs GeomNMF, and plots the estimated source attribution matrix.
+"""
+
 import numpy as np
 import pytest
-from geom_nmf import GeomNMF
+from geom_nmf import GeomNMF, barplot_Phi, permute_to_reference
+
+SEED = 123
+K, J, N = 3, 8, 500
+
+@pytest.fixture
+def synthetic_data():
+    rng = np.random.default_rng(SEED)
+    H_true = rng.dirichlet(np.ones(J), size=K)   # (K, J) row-stochastic endmembers
+    W_true = rng.dirichlet(np.ones(K), size=N)    # (N, K) row-stochastic weights
+    Y = W_true @ H_true                            # (N, J) observed data
+    return Y, H_true, W_true
 
 
 @pytest.fixture
-def sample_data():
-    rng = np.random.default_rng(42)
-    X = rng.random((100, 10))
-    y = rng.random(100)
-    return X, y
-
-
-class TestGeomNMFInit:
-    def test_default_params(self):
-        model = GeomNMF()
-        assert model.n_components == 10
-        assert model.max_iter == 200
-        assert model.tol == 1e-4
-        assert model.random_state is None
-
-    def test_custom_params(self):
-        model = GeomNMF(n_components=5, max_iter=100, tol=1e-3, random_state=0)
-        assert model.n_components == 5
-        assert model.max_iter == 100
-        assert model.tol == 1e-3
-        assert model.random_state == 0
-
-    def test_get_params(self):
-        model = GeomNMF(n_components=5)
-        params = model.get_params()
-        assert params["n_components"] == 5
-
-    def test_set_params(self):
-        model = GeomNMF()
-        model.set_params(n_components=3)
-        assert model.n_components == 3
-
-
-class TestGeomNMFFit:
-    def test_fit_returns_self(self, sample_data):
-        X, y = sample_data
-        model = GeomNMF()
-        result = model.fit(X, y)
-        assert result is model
-
-    def test_fit_sets_n_features_in(self, sample_data):
-        X, y = sample_data
-        model = GeomNMF().fit(X, y)
-        assert model.n_features_in_ == X.shape[1]
-
-    def test_fit_sets_is_fitted(self, sample_data):
-        X, y = sample_data
-        model = GeomNMF().fit(X, y)
-        assert model.is_fitted_
-
-    def test_fit_invalid_input(self):
-        model = GeomNMF()
-        with pytest.raises(ValueError):
-            model.fit([[1, 2], [3, 4]], [1])  # mismatched samples
-
-
-class TestGeomNMFPredict:
-    def test_predict_raises_before_fit(self, sample_data):
-        X, _ = sample_data
-        model = GeomNMF()
-        with pytest.raises(Exception):
-            model.predict(X)
-
-    def test_predict_not_implemented(self, sample_data):
-        X, y = sample_data
-        model = GeomNMF().fit(X, y)
-        with pytest.raises(NotImplementedError):
-            model.predict(X)
+def geom_nmf_result(synthetic_data):
+    Y, _, _ = synthetic_data
+    return GeomNMF(Y, K=K, seed=SEED, verbose=False)
+
+
+def test_result_attributes(geom_nmf_result):
+    result = geom_nmf_result
+    assert hasattr(result, "H_star")
+    assert hasattr(result, "Phi")
+    assert hasattr(result, "mu_tilde")
+    assert hasattr(result, "logvol")
+    assert hasattr(result, "elapsed")
+
+
+def test_intrinsic_rank(geom_nmf_result):
+    result = geom_nmf_result
+    assert result.fit_diagnostics["intrinsic_rank"] == K - 1
+
+
+def test_endmember_shape(geom_nmf_result):
+    result = geom_nmf_result
+    assert result.H_star.shape == (K, J)
+
+
+def test_phi_shape(geom_nmf_result):
+    result = geom_nmf_result
+    assert result.Phi.shape == (K, J)
+
+
+def test_alignment_error(synthetic_data, geom_nmf_result):
+    _, H_true, _ = synthetic_data
+    result = geom_nmf_result
+    H_perm, _, _, order = permute_to_reference(
+        H_true, result.H_star, result.mu_tilde, result.Phi
+    )
+    assert len(order) == K
+    errors = np.linalg.norm(H_true - H_perm, axis=1)
+    assert np.all(errors < 0.5), f"Large endmember errors: {errors.round(4)}"
+
+
+def test_barplot_runs(geom_nmf_result):
+    feature_labels = [f"F{j+1}" for j in range(J)]
+    barplot_Phi(
+        geom_nmf_result.Phi,
+        title="Estimated source attribution (GeomNMF)",
+        feature_labels=feature_labels,
+    )
diff --git a/tests/test_viz.py b/tests/test_viz.py
deleted file mode 100644
index 16ccd72..0000000
--- a/tests/test_viz.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import numpy as np
-import pytest
-from geom_nmf import GeomNMF
-from geom_nmf import viz
-
-
-@pytest.fixture
-def fitted_model():
-    rng = np.random.default_rng(42)
-    X = rng.random((100, 10))
-    y = rng.random(100)
-    return GeomNMF().fit(X, y)
-
-
-@pytest.fixture
-def coords():
-    rng = np.random.default_rng(42)
-    return rng.uniform(low=[-180, -90], high=[180, 90], size=(100, 2))
-
-
-class TestPlotComponents:
-    def test_not_implemented(self, fitted_model):
-        with pytest.raises(NotImplementedError):
-            viz.plot_components(fitted_model)
-
-
-class TestPlotReconstructionError:
-    def test_not_implemented(self, fitted_model):
-        with pytest.raises(NotImplementedError):
-            viz.plot_reconstruction_error(fitted_model)
-
-
-class TestPlotSpatialMap:
-    def test_not_implemented(self, fitted_model, coords):
-        with pytest.raises(NotImplementedError):
-            viz.plot_spatial_map(fitted_model, coords)
-
-
-class TestPlotPredictedVsActual:
-    def test_not_implemented(self):
-        y_true = np.random.default_rng(42).random(100)
-        y_pred = np.random.default_rng(0).random(100)
-        with pytest.raises(NotImplementedError):
-            viz.plot_predicted_vs_actual(y_true, y_pred)

From e17964151772c2f7022cde4071558c69cfdee87d Mon Sep 17 00:00:00 2001
From: Bora Jin <jinbora0720@gmail.com>
Date: Fri, 20 Mar 2026 12:56:12 -0400
Subject: [PATCH 2/6] env activate instead of shell for virtual environment

Added instructions for activating the poetry environment.
---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index dc388ae..abc85c1 100644
--- a/README.md
+++ b/README.md
@@ -99,6 +99,11 @@ To activate the virtual environment:
 ```sh
 poetry shell
 ```
+or 
+
+```sh
+poetry env activate
+```
 
 Your prompt will change to indicate you're inside the environment. To exit when you're done:
 

From 907a474f37bfa24bcd1e90e77c1676f63beade12 Mon Sep 17 00:00:00 2001
From: jinbora0720 <jinbora0720@gmail.com>
Date: Fri, 20 Mar 2026 14:57:12 -0400
Subject: [PATCH 3/6] redefine n_clusters as minimum

---
 src/geom_nmf/core.py       | 5 +++--
 src/geom_nmf/endmembers.py | 8 ++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/geom_nmf/core.py b/src/geom_nmf/core.py
index b42ee0e..9b0a1c4 100644
--- a/src/geom_nmf/core.py
+++ b/src/geom_nmf/core.py
@@ -81,7 +81,8 @@ def GeomNMF(
         * ``"exact"`` – convex-hull vertices via QHull (exact but may be
           slow in high intrinsic dimension).
         * ``"random"`` – random projection extremes (fast approximation;
-          controlled by *n_directions*, *n_top*, *n_candidates*).
+          controlled by *n_directions*, *n_top*, *n_candidates*). When n >> J, 
+          this can be slower than ``"exact"``.
 
         Default ``"exact"``.
     n_directions : int, optional
@@ -98,7 +99,7 @@ def GeomNMF(
         cluster before the exhaustive volume search.  Useful when the
         candidate pool is large.  Default False.
     n_clusters : int or None, optional
-        Number of clusters when *prune=True*.  Defaults to ``10*K``.
+        Minimum number of clusters when *prune=True*.  Defaults to ``10*K``.
     refine_greedy : bool, optional
         If True, run one pass of N-FINDR greedy swaps on the initial
         solution to try to improve the simplex volume.  Only meaningful when
diff --git a/src/geom_nmf/endmembers.py b/src/geom_nmf/endmembers.py
index 8bee0d3..2650931 100644
--- a/src/geom_nmf/endmembers.py
+++ b/src/geom_nmf/endmembers.py
@@ -143,15 +143,15 @@ def prune_close_points(
  
     The number of clusters is chosen automatically by maximizing the
     silhouette score over the range
-    ``[max(2, n_clusters // 2), min(2 * n_clusters, m - 1)]``,
-    centered around *n_clusters*.
+    ``[max(2, n_clusters), min(2 * n_clusters, m - 1)]``,
+    with *n_clusters* as the lower bound.
  
     Parameters
     ----------
     points : np.ndarray, shape (m, J)
         Points to prune.  Must be finite (no NaN or Inf).
     n_clusters : int, optional
-        Target number of representatives (clusters) to retain after pruning.  The
+        Minimum number of representatives (clusters) to retain after pruning.  The
         silhouette search explores a range around this value, so the actual
         number returned may differ slightly.  Default 25.
     seed : int, optional
@@ -179,7 +179,7 @@ def prune_close_points(
     except ImportError:
         return X, list(range(m))
  
-    lower = max(2, n_clusters // 2)
+    lower = max(2, n_clusters)
     upper = min(2 * n_clusters, m - 1)
  
     if lower > upper:

From 367b6cec5e22f10624ecefa1817cbc90706fe609 Mon Sep 17 00:00:00 2001
From: jinbora0720 <jinbora0720@gmail.com>
Date: Fri, 20 Mar 2026 16:31:36 -0400
Subject: [PATCH 4/6] tighten the search interval for n_clusters and add more
 explicit dosctring

---
 src/geom_nmf/core.py       |  4 +++-
 src/geom_nmf/endmembers.py | 10 +++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/geom_nmf/core.py b/src/geom_nmf/core.py
index 9b0a1c4..a731022 100644
--- a/src/geom_nmf/core.py
+++ b/src/geom_nmf/core.py
@@ -99,7 +99,9 @@ def GeomNMF(
         cluster before the exhaustive volume search.  Useful when the
         candidate pool is large.  Default False.
     n_clusters : int or None, optional
-        Minimum number of clusters when *prune=True*.  Defaults to ``10*K``.
+        Target minimum number of clusters when *prune=True*.  The actual number
+        returned may be lower if empty clusters reduce the count.  
+        Defaults to ``10*K``.
     refine_greedy : bool, optional
         If True, run one pass of N-FINDR greedy swaps on the initial
         solution to try to improve the simplex volume.  Only meaningful when
diff --git a/src/geom_nmf/endmembers.py b/src/geom_nmf/endmembers.py
index 2650931..4decc54 100644
--- a/src/geom_nmf/endmembers.py
+++ b/src/geom_nmf/endmembers.py
@@ -143,7 +143,7 @@ def prune_close_points(
  
     The number of clusters is chosen automatically by maximizing the
     silhouette score over the range
-    ``[max(2, n_clusters), min(2 * n_clusters, m - 1)]``,
+    ``[max(2, n_clusters), min(int(1.5 * n_clusters), m - 1)]``,
     with *n_clusters* as the lower bound.
  
     Parameters
@@ -151,9 +151,9 @@ def prune_close_points(
     points : np.ndarray, shape (m, J)
         Points to prune.  Must be finite (no NaN or Inf).
     n_clusters : int, optional
-        Minimum number of representatives (clusters) to retain after pruning.  The
-        silhouette search explores a range around this value, so the actual
-        number returned may differ slightly.  Default 25.
+        Target minimum number of representatives to retain after pruning.
+        The silhouette search starts at this value, but the actual number
+        returned may be lower if empty clusters reduce the count.  Default 25.
     seed : int, optional
         Random seed passed to MiniBatchKMeans.  Default 123.
  
@@ -180,7 +180,7 @@ def prune_close_points(
         return X, list(range(m))
  
     lower = max(2, n_clusters)
-    upper = min(2 * n_clusters, m - 1)
+    upper = min(int(1.5 * n_clusters), m - 1)
  
     if lower > upper:
         return X, list(range(m))

From e06d349b90e7b1621d9aced36a0c0cb2361108df Mon Sep 17 00:00:00 2001
From: jinbora0720 <jinbora0720@gmail.com>
Date: Wed, 25 Mar 2026 23:44:50 -0400
Subject: [PATCH 5/6] delete useless text

---
 src/geom_nmf/nfindr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/geom_nmf/nfindr.py b/src/geom_nmf/nfindr.py
index 8ba7d79..dc07f45 100644
--- a/src/geom_nmf/nfindr.py
+++ b/src/geom_nmf/nfindr.py
@@ -7,7 +7,7 @@
 Reference
 ---------
 Winter, M. E. (1999). N-FINDR: An algorithm for fast autonomous spectral
-end-member determination in hyperspectral data.  *Proc. SPIE 3753*.
+end-member determination in hyperspectral data. 
 """
 
 import math

From 54074f5ea0b6bba58f800664dbe293623b291f5a Mon Sep 17 00:00:00 2001
From: jinbora0720 <jinbora0720@gmail.com>
Date: Wed, 25 Mar 2026 23:52:32 -0400
Subject: [PATCH 6/6] fixed ATGP reference

---
 src/geom_nmf/nfindr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/geom_nmf/nfindr.py b/src/geom_nmf/nfindr.py
index dc07f45..9076a06 100644
--- a/src/geom_nmf/nfindr.py
+++ b/src/geom_nmf/nfindr.py
@@ -205,7 +205,7 @@ def nfindr(
     -----
     Deviations from Winter (1999):
 
-    * Defaults to ATGP initialization (Plaza et al., 2002) instead of
+    * Defaults to ATGP initialization (Plaza and Chang, 2006) instead of
       random; pass ``init="random"`` to recover the original behavior.
     * Optional L2 row-normalization before PCA (``normalize=True`` by
       default); Winter (1999) does not normalize.