From a608eb4da8995715285c889bc90d0f9d8dd6b823 Mon Sep 17 00:00:00 2001
From: Nick Kinney <Nick.Kinney4@gmail.com>
Date: Sat, 30 May 2026 20:03:29 -0400
Subject: [PATCH 1/3] feat(data): centralize sample-dataset metadata in a
 manifest (#774)

Sample-dataset metadata was duplicated across four places: the long
if/elif chain in load_sample, the tests, docs/library/sample_data.md,
and the per-file include list in MANIFEST.in. Adding a sample meant
editing all four by hand, and they had already drifted (sample_data.md
listed 23 of 46 datasets with several misspelled names; MANIFEST.in listed
22, so the sdist shipped incomplete).

Introduce chainladder/utils/data/_manifest.py as the single source of
truth (a plain Python dict, no new dependency), and key everything off
it:

- load_sample now looks up its Triangle config from the manifest instead
  of the if/elif chain. Verified behavior-preserving: the resolved
  origin/development/index/columns/cumulative for all 46 bundled samples
  is byte-identical before and after.
- New public cl.list_samples() returns a DataFrame of name, index,
  columns, cumulative, and (unless include_grain=False) grain + period
  counts. Doubles as the source for the docs table.
- test_load_sample iterates the manifest rather than globbing the data
  directory, so adding a sample is a one-entry change and stray non-CSV
  files cannot be mistaken for datasets. Added a both-ways sync assertion
  (manifest == CSVs on disk) and a test_list_samples test.
- MANIFEST.in collapses the 22 hand-listed includes to one
  recursive-include chainladder/utils/data *.csv. Verified the built
  sdist now contains all 46 CSVs (was 22).
- docs/library/sample_data.md is regenerated from cl.list_samples() via
  scripts/regen_sample_data_docs.py; the table is now complete and
  accurate.

Closes #774.
---
 MANIFEST.in                               |  23 +-
 chainladder/utils/__init__.py             |   1 +
 chainladder/utils/data/_manifest.py       | 355 ++++++++++++++++++++++
 chainladder/utils/tests/test_utilities.py |  55 +++-
 chainladder/utils/utility_functions.py    | 246 ++++++---------
 docs/library/sample_data.md               | 101 +++---
 scripts/regen_sample_data_docs.py         |  88 ++++++
 7 files changed, 623 insertions(+), 246 deletions(-)
 create mode 100644 chainladder/utils/data/_manifest.py
 create mode 100644 scripts/regen_sample_data_docs.py

diff --git a/MANIFEST.in b/MANIFEST.in
index 0eed2f16..b0773a5f 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,25 +2,4 @@ include LICENSE
 include requirements.txt
 include README.rst
 
-include chainladder/utils/data/abc.csv
-include chainladder/utils/data/auto.csv
-include chainladder/utils/data/clrd.csv
-include chainladder/utils/data/clrd2025.csv
-include chainladder/utils/data/genins.csv
-include chainladder/utils/data/liab.csv
-include chainladder/utils/data/m3ir5.csv
-include chainladder/utils/data/mcl.csv
-include chainladder/utils/data/mortgage.csv
-include chainladder/utils/data/mw2008.csv
-include chainladder/utils/data/mw2014.csv
-include chainladder/utils/data/quarterly.csv
-include chainladder/utils/data/raa.csv
-include chainladder/utils/data/ukmotor.csv
-include chainladder/utils/data/usaa.csv
-include chainladder/utils/data/usauto.csv
-include chainladder/utils/data/cc_sample.csv
-include chainladder/utils/data/ia_sample.csv
-include chainladder/utils/data/prism.csv
-include chainladder/utils/data/tail_sample.csv
-include chainladder/utils/data/berqsherm.csv
-include chainladder/utils/data/xyz.csv
\ No newline at end of file
+recursive-include chainladder/utils/data *.csv
diff --git a/chainladder/utils/__init__.py b/chainladder/utils/__init__.py
index 1800834f..3f665072 100644
--- a/chainladder/utils/__init__.py
+++ b/chainladder/utils/__init__.py
@@ -12,6 +12,7 @@
     read_json,
     concat,
     load_sample,
+    list_samples,
     minimum,
     maximum,
     PatsyFormula,
diff --git a/chainladder/utils/data/_manifest.py b/chainladder/utils/data/_manifest.py
new file mode 100644
index 00000000..ff91a191
--- /dev/null
+++ b/chainladder/utils/data/_manifest.py
@@ -0,0 +1,355 @@
+"""Central registry of bundled sample datasets.
+
+Single source of truth for the metadata of every CSV in
+``chainladder/utils/data/``. Consumed by:
+
+* :func:`chainladder.load_sample` -- to build the ``Triangle`` for a sample.
+* :func:`chainladder.list_samples` -- to list available samples.
+* ``docs/library/sample_data.md`` -- regenerated from this registry.
+* ``MANIFEST.in`` -- ships ``chainladder/utils/data/*.csv`` via a wildcard.
+
+Adding a new sample dataset is a one-entry change here (plus dropping the
+CSV in ``chainladder/utils/data/``); ``load_sample``, ``list_samples``, the
+docs table, and the tests all key off this dict, so the metadata no longer
+has to be repeated by hand in each of them.
+
+Each entry maps the sample name (the CSV filename without extension, lower
+case) to the keyword arguments passed to ``Triangle``:
+
+``origin``
+    Column name(s) for the origin period.
+``development``
+    Column name(s) for the development period.
+``index``
+    Column name(s) used as the Triangle index, or ``None``.
+``columns``
+    Measure column name(s) loaded into the Triangle.
+``cumulative``
+    ``True`` if the measures are cumulative, ``False`` if incremental.
+"""
+
+SAMPLES: dict = {
+
+    "abc": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['values'],
+        "cumulative": True,
+    },
+    "auto": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": ['lob'],
+        "columns": ['incurred', 'paid'],
+        "cumulative": True,
+    },
+    "berqsherm": {
+        "origin": 'AccidentYear',
+        "development": 'DevelopmentYear',
+        "index": ['LOB'],
+        "columns": ['Incurred', 'Paid', 'Reported', 'Closed'],
+        "cumulative": True,
+    },
+    "cc_sample": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['loss', 'exposure'],
+        "cumulative": True,
+    },
+    "clrd": {
+        "origin": 'AccidentYear',
+        "development": 'DevelopmentYear',
+        "index": ['GRNAME', 'LOB'],
+        "columns": ['IncurLoss', 'CumPaidLoss', 'BulkLoss', 'EarnedPremDIR', 'EarnedPremCeded', 'EarnedPremNet'],
+        "cumulative": True,
+    },
+    "clrd2025": {
+        "origin": 'AccidentYear',
+        "development": 'DevelopmentYear',
+        "index": ['GRNAME', 'LOB'],
+        "columns": ['IncurredLosses', 'CumPaidLoss', 'BulkLoss', 'EarnedPremDIR', 'EarnedPremCeded', 'EarnedPremNet'],
+        "cumulative": True,
+    },
+    "friedland_auto_bi_insurer": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Paid Claims', 'Reported Claims'],
+        "cumulative": True,
+    },
+    "friedland_auto_freq_sev": {
+        "origin": 'Accident Half-Year',
+        "development": 'Calendar Half-Year',
+        "index": None,
+        "columns": ['Closed Claim Counts', 'Reported Claim Counts', 'Reported Claims', 'Reported Severity'],
+        "cumulative": True,
+    },
+    "friedland_auto_salsub": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Reported Salvage and Subrogation', 'Received Salvage and Subrogation', 'Reported Claims', 'Paid Claims'],
+        "cumulative": True,
+    },
+    "friedland_autoprop": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Reported ALAE', 'Paid ALAE', 'Reported Claims', 'Paid Claims'],
+        "cumulative": True,
+    },
+    "friedland_berq_sher_auto": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Paid Claims', 'Closed Claim Counts', 'Reported Claim Counts', 'Disposal Rate'],
+        "cumulative": True,
+    },
+    "friedland_gl_insurer": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Closed Claim Counts', 'Reported Claim Counts', 'Disposal Rate', 'Paid Claims'],
+        "cumulative": True,
+    },
+    "friedland_med_mal": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Reported Claims', 'Paid Claims', 'Case Outstanding', 'Open Claim Counts'],
+        "cumulative": True,
+    },
+    "friedland_qs": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Gross Reported Claims', 'Net Reported Claims', 'Net to Gross'],
+        "cumulative": True,
+    },
+    "friedland_us_auto_chg_prod_mix": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Paid Claims', 'Reported Claims'],
+        "cumulative": True,
+    },
+    "friedland_us_auto_incr_claim": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Paid Claims', 'Reported Claims'],
+        "cumulative": True,
+    },
+    "friedland_us_auto_steady_state": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Paid Claims', 'Reported Claims'],
+        "cumulative": True,
+    },
+    "friedland_us_industry_auto": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Paid Claims', 'Reported Claims'],
+        "cumulative": True,
+    },
+    "friedland_us_industry_auto_case": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Case Outstanding', 'Paid Claims'],
+        "cumulative": True,
+    },
+    "friedland_uspp_auto_increasing_case": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Reported Claims', 'Paid Claims', 'Earned Premium'],
+        "cumulative": True,
+    },
+    "friedland_uspp_auto_increasing_claim": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Reported Claims', 'Paid Claims', 'Earned Premium'],
+        "cumulative": True,
+    },
+    "friedland_uspp_auto_steady_state": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Reported Claims', 'Paid Claims', 'Earned Premium'],
+        "cumulative": True,
+    },
+    "friedland_uspp_increasing_claim_case": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Reported Claims', 'Paid Claims', 'Earned Premium'],
+        "cumulative": True,
+    },
+    "friedland_wc_self_insurer": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Closed Claim Counts', 'Reported Claim Counts', 'Paid Claims', 'Paid Severities', 'Reported Claims', 'Reported Severities'],
+        "cumulative": True,
+    },
+    "friedland_xol": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Gross Reported Claims', 'Net Reported Claims', 'Ceded Reported Claims'],
+        "cumulative": True,
+    },
+    "friedland_xyz_auto_bi": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Paid Claims', 'Reported Claims'],
+        "cumulative": True,
+    },
+    "friedland_xyz_case": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Case Outstanding', 'Paid Claims'],
+        "cumulative": True,
+    },
+    "friedland_xyz_disp": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Disposal Rate', 'Closed Claim Counts', 'Paid Claims'],
+        "cumulative": True,
+    },
+    "friedland_xyz_freq_sev": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Closed Claim Counts', 'Reported Claim Counts', 'Reported Claims', 'Reported Severities'],
+        "cumulative": True,
+    },
+    "genins": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['values'],
+        "cumulative": True,
+    },
+    "ia_sample": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['loss', 'exposure'],
+        "cumulative": True,
+    },
+    "liab": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": ['lob'],
+        "columns": ['values'],
+        "cumulative": True,
+    },
+    "m3ir5": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['values'],
+        "cumulative": True,
+    },
+    "mack_1997": {
+        "origin": 'Accident Year',
+        "development": 'Calendar Year',
+        "index": None,
+        "columns": ['Case Incurred'],
+        "cumulative": True,
+    },
+    "mcl": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['incurred', 'paid'],
+        "cumulative": True,
+    },
+    "mortgage": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['values'],
+        "cumulative": True,
+    },
+    "mw2008": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['values'],
+        "cumulative": True,
+    },
+    "mw2014": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['values'],
+        "cumulative": True,
+    },
+    "prism": {
+        "origin": 'AccidentDate',
+        "development": 'PaymentDate',
+        "index": ['ClaimNo', 'Line', 'Type', 'ClaimLiability', 'Limit', 'Deductible'],
+        "columns": ['reportedCount', 'closedPaidCount', 'Paid', 'Incurred'],
+        "cumulative": False,
+    },
+    "quarterly": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['incurred', 'paid'],
+        "cumulative": True,
+    },
+    "raa": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['values'],
+        "cumulative": True,
+    },
+    "tail_sample": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['incurred', 'paid'],
+        "cumulative": True,
+    },
+    "ukmotor": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['values'],
+        "cumulative": True,
+    },
+    "usaa": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['incurred', 'paid'],
+        "cumulative": True,
+    },
+    "usauto": {
+        "origin": 'origin',
+        "development": 'development',
+        "index": None,
+        "columns": ['incurred', 'paid'],
+        "cumulative": True,
+    },
+    "xyz": {
+        "origin": 'AccidentYear',
+        "development": 'DevelopmentYear',
+        "index": None,
+        "columns": ['Incurred', 'Paid', 'Reported', 'Closed', 'Premium'],
+        "cumulative": True,
+    },
+}
diff --git a/chainladder/utils/tests/test_utilities.py b/chainladder/utils/tests/test_utilities.py
index 0aeffd27..bc68abfc 100644
--- a/chainladder/utils/tests/test_utilities.py
+++ b/chainladder/utils/tests/test_utilities.py
@@ -8,6 +8,7 @@
     __dt64_unit__
 )
 from chainladder.utils.utility_functions import date_delta_adjustment
+from chainladder.utils.data._manifest import SAMPLES
 from pathlib import Path
 
 
@@ -130,23 +131,51 @@ def test_invalid_sample() -> None:
 
 def test_load_sample() -> None:
     """
-    Tests whether the supported sample data sets load.
-    """
+    Tests whether every sample data set declared in the manifest loads.
 
-    # Get the folder containing the datasets.
-    data_dir: Path = Path(__file__).parent.parent / 'data'
+    Iterating over the manifest (rather than globbing the data directory)
+    means adding a new sample is a one-entry change in
+    ``chainladder/utils/data/_manifest.py`` and this test picks it up
+    automatically, while non-data files in the folder (``__init__.py``,
+    ``_manifest.py``) are never mistaken for datasets.
+    """
+    # Every manifest entry must load and have a matching CSV on disk.
+    data_dir: Path = Path(__file__).parent.parent / "data"
+    for dataset in SAMPLES:
+        assert (data_dir / f"{dataset}.csv").is_file(), (
+            f"manifest lists '{dataset}' but {dataset}.csv is missing"
+        )
+        cl.load_sample(dataset)
 
-    # Files to exclude from cl.load_sample().
-    files_to_excl: list = [
-        '__init__'
-    ]
+    # Conversely, every CSV on disk must be declared in the manifest, so a
+    # newly added data file can't silently go unregistered.
+    csv_stems = {f.stem for f in data_dir.glob("*.csv")}
+    assert csv_stems == set(SAMPLES), (
+        "manifest and data directory are out of sync: "
+        f"only in dir={csv_stems - set(SAMPLES)}, "
+        f"only in manifest={set(SAMPLES) - csv_stems}"
+    )
 
-    # Gather list of files to test.
-    datasets = [f.stem for f in data_dir.iterdir() if f.is_file() and f.stem not in files_to_excl]
 
-    # Load each file.
-    for dataset in datasets:
-        cl.load_sample(dataset)
+def test_list_samples() -> None:
+    """
+    Tests cl.list_samples(): the manifest-driven catalog of bundled datasets.
+    """
+    df = cl.list_samples()
+    # One row per manifest entry, indexed by sample name.
+    assert df.index.name == "name"
+    assert set(df.index) == set(SAMPLES)
+    assert {"index", "columns", "cumulative", "origin_grain", "development_grain"} <= set(df.columns)
+
+    # The fast path skips loading data and therefore omits the grain columns.
+    fast = cl.list_samples(include_grain=False)
+    assert set(fast.index) == set(SAMPLES)
+    assert "origin_grain" not in fast.columns
+    assert "development_grain" not in fast.columns
+
+    # Metadata matches the manifest source of truth.
+    assert df.loc["clrd2025", "columns"] == SAMPLES["clrd2025"]["columns"]
+    assert df.loc["prism", "cumulative"] == SAMPLES["prism"]["cumulative"]
 
 
 def test_load_sample_clrd2025() -> None:
diff --git a/chainladder/utils/utility_functions.py b/chainladder/utils/utility_functions.py
index a124a37f..cd0b892e 100644
--- a/chainladder/utils/utility_functions.py
+++ b/chainladder/utils/utility_functions.py
@@ -15,6 +15,7 @@
     __dt64_dtype__
 )
 from chainladder.utils.sparse import sp
+from chainladder.utils.data._manifest import SAMPLES
 from io import StringIO
 from patsy import dmatrix  # noqa
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -43,7 +44,7 @@ def load_sample(key: str, *args, **kwargs) -> Triangle:
 
         Datasets that are commonly used in examples are: raa, clrd, and prism.
 
-        And a complete list of available datasets is: abc, auto, berqsherm, cc_sample, clrd, clrd2025, genins, ia_sample, liab, m3ir5, mack_1997, mcl, mortgage, mw2008, mw2014, prism, quarterly, raa, tail_sample, ukmotor, usaa, usauto, xyz.
+        For the complete list of available datasets, call :func:`list_samples`.
 
     Returns
     -------
@@ -86,10 +87,10 @@ def load_sample(key: str, *args, **kwargs) -> Triangle:
     # Set base path to be the parent directory of this file, e.g., the utils folder.
     utils_path: AnyStr = os.path.dirname(os.path.abspath(__file__))
 
-    # Validate that the file indicated by the key argument exists.
-    dataset_path: str = os.path.join(utils_path, "data", key.lower() + ".csv")
-
-    if not os.path.exists(dataset_path):
+    # Validate the key against the sample-dataset manifest. The manifest is the
+    # authoritative list of available samples; every entry has a matching CSV in
+    # the data folder.
+    if key.lower() not in SAMPLES:
         raise ValueError(
             """
             Invalid key supplied. The key should match the name, without extension, of one of the file names
@@ -102,162 +103,19 @@ def load_sample(key: str, *args, **kwargs) -> Triangle:
             )
         )
 
-    # Set initial values for arguments to Triangle __init__. These may be overridden by
-    # values specific to the data set.
-    origin: str = "origin"
-    development: str = "development"
-    columns: list = ["values"]
-    index: list | None = None
-    cumulative: bool = True
-
-    if key.lower() in ["mcl", "usaa", "quarterly", "auto", "usauto", "tail_sample"]:
-        columns: list = ["incurred", "paid"]
-    if key.lower() == "clrd":
-        origin: str = "AccidentYear"
-        development: str = "DevelopmentYear"
-        index: list = ["GRNAME", "LOB"]
-        columns: list = [
-            "IncurLoss",
-            "CumPaidLoss",
-            "BulkLoss",
-            "EarnedPremDIR",
-            "EarnedPremCeded",
-            "EarnedPremNet",
-        ]
-    if key.lower() == "clrd2025":
-        origin: str = "AccidentYear"
-        development: str = "DevelopmentYear"
-        index: list = ["GRNAME", "LOB"]
-        columns: list = [
-            "IncurredLosses",
-            "CumPaidLoss",
-            "BulkLoss",
-            "EarnedPremDIR",
-            "EarnedPremCeded",
-            "EarnedPremNet",
-        ]
-    if key.lower() == "berqsherm":
-        origin: str = "AccidentYear"
-        development: str = "DevelopmentYear"
-        index: list = ["LOB"]
-        columns: list = ["Incurred", "Paid", "Reported", "Closed"]
-    if key.lower() == "xyz":
-        origin: str = "AccidentYear"
-        development: str = "DevelopmentYear"
-        columns: list = ["Incurred", "Paid", "Reported", "Closed", "Premium"]
-    if key.lower() in ["liab", "auto"]:
-        index: list = ["lob"]
-    if key.lower() in ["cc_sample", "ia_sample"]:
-        columns: list = ["loss", "exposure"]
-    if key.lower() in ["prism"]:
-        columns: list = ["reportedCount", "closedPaidCount", "Paid", "Incurred"]
-        index: list = [
-            "ClaimNo",
-            "Line",
-            "Type",
-            "ClaimLiability",
-            "Limit",
-            "Deductible",
-        ]
-        origin: str = "AccidentDate"
-        development: str = "PaymentDate"
-        cumulative: bool = False
-    if "mack_1997" in key.lower():
-        columns = ["Case Incurred"]
-        origin = "Accident Year"
-        development = "Calendar Year"
-        cumulative: bool = True
-    # Friedland datasets
-    if "friedland" in key.lower():
-        columns: list = ["Paid Claims", "Reported Claims"]
-        origin: str = "Accident Year"
-        development: str = "Calendar Year"
-        cumulative: bool = True
-        index: None = None
-        if "autoprop" in key.lower():
-            columns: list = [
-                "Reported ALAE",
-                "Paid ALAE",
-                "Reported Claims",
-                "Paid Claims",
-            ]
-        if "auto_salsub" in key.lower():
-            columns: list = [
-                "Reported Salvage and Subrogation",
-                "Received Salvage and Subrogation",
-                "Reported Claims",
-                "Paid Claims",
-            ]
-        if "berq_sher_auto" in key.lower():
-            columns: list = [
-                "Paid Claims",
-                "Closed Claim Counts",
-                "Reported Claim Counts",
-                "Disposal Rate",
-            ]
-        if "gl_insurer" in key.lower():
-            columns: list = [
-                "Closed Claim Counts",
-                "Reported Claim Counts",
-                "Disposal Rate",
-                "Paid Claims",
-            ]
-        if "med_mal" in key.lower():
-            columns: list = [
-                "Reported Claims",
-                "Paid Claims",
-                "Case Outstanding",
-                "Open Claim Counts",
-            ]
-        if "qs" in key.lower():
-            columns: list = [
-                "Gross Reported Claims",
-                "Net Reported Claims",
-                "Net to Gross",
-            ]
-        if "auto_case" in key.lower():
-            columns: list = ["Case Outstanding", "Paid Claims"]
-        if "wc_self_insurer" in key.lower():
-            columns: list = [
-                "Closed Claim Counts",
-                "Reported Claim Counts",
-                "Paid Claims",
-                "Paid Severities",
-                "Reported Claims",
-                "Reported Severities",
-            ]
-        if "xol" in key.lower():
-            columns: list = [
-                "Gross Reported Claims",
-                "Net Reported Claims",
-                "Ceded Reported Claims",
-            ]
-        if "xyz_case" in key.lower():
-            columns: list = ["Case Outstanding", "Paid Claims"]
-        if "xyz_disp" in key.lower():
-            columns: list = ["Disposal Rate", "Closed Claim Counts", "Paid Claims"]
-        if "xyz_freq_sev" in key.lower():
-            columns: list = [
-                "Closed Claim Counts",
-                "Reported Claim Counts",
-                "Reported Claims",
-                "Reported Severities",
-            ]
-        if "auto_freq_sev" in key.lower():
-            columns: list = [
-                "Closed Claim Counts",
-                "Reported Claim Counts",
-                "Reported Claims",
-                "Reported Severity",
-            ]
-            origin: str = "Accident Half-Year"
-            development: str = "Calendar Half-Year"
-        if "uspp" in key.lower():
-            columns: list = [
-                "Reported Claims",
-                "Paid Claims",
-                "Earned Premium"
-            ]
+    dataset_path: str = os.path.join(utils_path, "data", key.lower() + ".csv")
+
+    # Look up the Triangle configuration for this sample from the central
+    # manifest (chainladder/utils/data/_manifest.py). The manifest is the
+    # single source of truth for sample-dataset metadata, replacing the long
+    # per-dataset if/elif chain that previously lived here and duplicated the
+    # column names already present in the tests and docs/library/sample_data.md.
+    config: dict = SAMPLES[key.lower()]
+    origin = config["origin"]
+    development = config["development"]
+    index = config["index"]
+    columns = config["columns"]
+    cumulative = config["cumulative"]
 
     df = pd.read_csv(filepath_or_buffer=dataset_path)
 
@@ -273,6 +131,72 @@ def load_sample(key: str, *args, **kwargs) -> Triangle:
     )
 
 
+# Human-readable labels for the single-character grain codes a Triangle exposes
+# via ``origin_grain`` / ``development_grain``.
+_GRAIN_LABELS: dict = {
+    "Y": "Annual",
+    "S": "Semiannual",
+    "Q": "Quarter",
+    "M": "Month",
+}
+
+
+def list_samples(include_grain: bool = True) -> DataFrame:
+    """List the sample datasets bundled with the chainladder package.
+
+    The returned table is driven by the sample-dataset manifest
+    (``chainladder/utils/data/_manifest.py``), the same source
+    :func:`load_sample` reads, so it always reflects exactly what is loadable.
+
+    Parameters
+    ----------
+    include_grain: bool
+        If ``True`` (default), load each sample to report its origin and
+        development grain (and the number of origin/development periods). This
+        is the slower path because every Triangle is built. Set to ``False`` to
+        return just the manifest metadata (name, index, columns, cumulative)
+        without loading any data.
+
+    Returns
+    -------
+        pandas.DataFrame indexed by sample name, with columns ``index``,
+        ``columns``, ``cumulative`` and, when ``include_grain`` is ``True``,
+        ``origin_grain``, ``development_grain``, ``origin_periods`` and
+        ``development_periods``.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        import chainladder as cl
+        cl.list_samples()                    # full table, grain included
+        cl.list_samples(include_grain=False) # fast, metadata only
+    """
+    records: list = []
+    for name in sorted(SAMPLES):
+        config: dict = SAMPLES[name]
+        record: dict = {
+            "name": name,
+            "index": config["index"],
+            "columns": config["columns"],
+            "cumulative": config["cumulative"],
+        }
+        if include_grain:
+            triangle = load_sample(name)
+            record["origin_grain"] = _GRAIN_LABELS.get(
+                triangle.origin_grain, triangle.origin_grain
+            )
+            record["development_grain"] = _GRAIN_LABELS.get(
+                triangle.development_grain, triangle.development_grain
+            )
+            record["origin_periods"] = len(triangle.origin)
+            record["development_periods"] = triangle.development.shape[0]
+        records.append(record)
+
+    return pd.DataFrame.from_records(records).set_index("name")
+
+
 def read_pickle(path):
     with open(path, "rb") as pkl:
         return dill.load(pkl)
diff --git a/docs/library/sample_data.md b/docs/library/sample_data.md
index b6a7dc07..1b8882f7 100644
--- a/docs/library/sample_data.md
+++ b/docs/library/sample_data.md
@@ -4,54 +4,55 @@ Below is the list of all datasets that come included with the `chainladder` pack
 
 You can load any dataset with `cl.load_sample(...)` such as `cl.load_sample("abc")`.
 
-\* Denotes datasets that are more interesting and possess unique characteristics.
+This table is generated from the sample-dataset manifest
+(`chainladder/utils/data/_manifest.py`) via `cl.list_samples()`. To regenerate it,
+run `python scripts/regen_sample_data_docs.py` from the repository root.
 
-
-| Dataset Name                     | Indexes                                                  | Columns                                                                                                          | Origin Grain             | Development Grain        |
-|----------------------------------|----------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|--------------------------|--------------------------|
-| abc                              | (none)                                                   | (none)                                                                                                           | Annual (11 Yrs)          | Annual (11 Yrs)          |
-| auto                             | [lob]                                                    | [incurred, paid]                                                                                                 | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| berqsherm                        | [LOB]                                                    | [Incurred, Paid, Reported, Closed]                                                                               | Annual (8 Yrs)           | Annual (8 Yrs)           |
-| cc_sample                        | [Total]                                                  | [loss, exposure]                                                                                                 | Annual (5 Yrs)           | Annual (5 Yrs)           |
-| clrd *                           | [GRNAME, LOB]                                            | [IncurLoss, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded,   EarnedPremNet]                              | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| clrd2025 *                       | [GRNAME, LOB]                                            | [IncurredLosses, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded, EarnedPremNet]                           | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| friedland_auto_bi_insurer*       | (none)                                                   | [Paid Claims, Reported Claims]                                                                                   | Annual (9 Yrs)           | Annual (9 Yrs)           | 
-| friedland_auto_freq_sev*         | (none)                                                   | [Closed Claim Counts, Reported Claim Counts, Reported Claims, Reported Severity]                                 | Semiannual (10 Half-Yrs) | Semiannual (10 Half-Yrs) | 
-| friedland_autoprop               | (none)                                                   | [Reported ALAE, Paid ALAE, Reported Claims, Paid Claims]                                                         | Annual (11 Yrs)          | Annual (11 Yrs)          |
-| friedland_auto_salsub            | (none)                                                   | [Reported Salvage and Subrogation, Received Salvage and Subrogation, Reported Claims, Paid Claims]               | Annual (11 Yrs)          | Annual (11 Yrs)          |
-| friedland_berq_sher_auto         | (none)                                                   | [Paid Claims, Closed Claim Counts, Reported Claim Counts, Disposal Rate]                                         | Annual (8 Yrs)           | Annual (8 Yrs)           | 
-| friedland_gl_insurer             | (none)                                                   | [Closed Claim Counts, Reported Claim Counts, Disposal Rate Paid Claims]                                          | Annual (8 Yrs)           | Annual (8 Yrs)           |
-| friedland_med_mal                | (none)                                                   | [Reported Claims, Paid Claims, Case Outstanding, Open Claim Counts]                                              | Annual (8 Yrs)           | Annual (8 Yrs)           |
-| friedland_qs                     | (none)                                                   | [Gross Reported Claims, Net Reported Claims, Net to Gross]                                                       | Annual (4 Yrs)           | Annual (4 Yrs)           |
-| friedland_us_auto_chg_prod_mix   | (none)                                                   | [Paid Claims, Reported Claims]                                                                                   | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| friedland_us_auto_incr_claim     | (none)                                                   | [Paid Claims, Reported Claims]                                                                                   | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| friedland_us_auto_steady_state   | (none)                                                   | [Paid Claims, Reported Claims]                                                                                   | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| friedland_us_industry_auto       | (none)                                                   | [Paid Claims, Reported Claims]                                                                                   | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| friedland_us_industy_auto_case   | (none)                                                   | [Case Outstanding, Paid Claims]                                                                                  | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| friedland_uspp_auto_incr_case    | (none)                                                   | [Paid Claims, Reported Claims]                                                                                   | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| friedland_uspp_auto_incr_claim   | (none)                                                   | [Paid Claims, Reported Claims]                                                                                   | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| frieldand_uspp_auto_steady_state | (none)                                                   | [Paid Claims, Reported Claims]                                                                                   | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| friedland_uspp_incr_claim_case   | (none)                                                   | [Paid Claims, Reported Claims]                                                                                   | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| friedland_wc_self_insurer        | (none)                                                   | [Closed Claim Counts, Reported Claim Counts, Paid Claims, Paid Severities, Reported Claims, Reported Severities] | Annual (8 Yrs)           | Annual (10 Yrs)          |
-| friedland_xol                    | (none)                                                   | [Gross Reported Claims, Net Reported Claims, Ceded Reported Claims]                                              | Annual (4 Yrs)           | Annual (4 Yrs)           |
-| friedland_xyz_auto_bi*           | (none)                                                   | [Paid Claims, Reported Claims]                                                                                   | Annual (11 Yrs)          | Annual (11 Yrs)          |
-| friedland_xyz_case*              | (none)                                                   | [Case Outstanding, Paid Claims]                                                                                  | Annual (11 Yrs)          | Annual (11 Yrs)          |
-| friedland_xyz_disp               | (none)                                                   | [Disposal Rate, Closed Claim Counts, Paid Claims]                                                                | Annual (8 Yrs)           | Annual (8 Yrs)           |
-| friedland_xyz_freq_sev*          | (none)                                                   | [Closed Claim Counts, Reported Claim Counts, Reported Claims, Reported Severities]                               | Annual (11 Yrs)          | Annual (11 Yrs)          |
-| genins                           | (none)                                                   | (none)                                                                                                           | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| ia_sample                        | [Total]                                                  | [loss, exposure]                                                                                                 | Annual (6 Yrs)           | Annual (6 Yrs)           |
-| liab *                           | [lob]                                                    | [values]                                                                                                         | Annual (14 Yrs)          | Annual (14 Yrs)          |
-| mack_1997                        | (none)                                                   | [Case Incurred]                                                                                                  | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| m3ir5                            | (none)                                                   | (none)                                                                                                           | Annual (14 Yrs)          | Annual (14 Yrs)          |
-| mcl                              | [Total]                                                  | [incurred,  paid]                                                                                                | Annual (7 Yrs)           | Annual (7 Yrs)           |
-| mortgage                         | (none)                                                   | (none)                                                                                                           | Annual (9 Yrs)           | Annual (9 Yrs)           |
-| mw2008                           | (none)                                                   | (none)                                                                                                           | Annual (9 Yrs)           | Annual (9 Yrs)           |
-| mw2014                           | (none)                                                   | (none)                                                                                                           | Annual (17 Yrs)          | Annual (17 Yrs)          |
-| prism *                          | [ClaimNo, Line, Type, ClaimLiability, Limit, Deductible] | [reportedCount, closedPaidCount, Paid, Incurred]                                                                 | Month (120 months)       | Month (120 months)       |
-| quarterly *                      | [Total]                                                  | [incurred,  paid]                                                                                                | Annual (12 Yrs)          | Quarter (45 Qtrs)        |
-| raa                              | (none)                                                   | (none)                                                                                                           | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| tail_sample                      | [Total]                                                  | [incurred, paid]                                                                                                 | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| ukmotor                          | (none)                                                   | (none)                                                                                                           | Annual (7 Yrs)           | Annual (7 Yrs)           |
-| usaa                             | [Total]                                                  | [incurred,  paid]                                                                                                | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| usauto                           | [Total]                                                  | [incurred,  paid]                                                                                                | Annual (10 Yrs)          | Annual (10 Yrs)          |
-| xyz*                             | [Total]                                                  | [Incurred, Paid, Reported, Closed, Premium]                                                                      | Annual (11 Yrs)          | Annual (11 Yrs)          |
+| Dataset Name | Indexes | Columns | Origin Grain | Development Grain |
+|---|---|---|---|---|
+| abc | (none) | [values] | Annual (11 Yrs) | Annual (11 Yrs) |
+| auto | [lob] | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) |
+| berqsherm | [LOB] | [Incurred, Paid, Reported, Closed] | Annual (8 Yrs) | Annual (8 Yrs) |
+| cc_sample | (none) | [loss, exposure] | Annual (5 Yrs) | Annual (5 Yrs) |
+| clrd | [GRNAME, LOB] | [IncurLoss, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded, EarnedPremNet] | Annual (10 Yrs) | Annual (10 Yrs) |
+| clrd2025 | [GRNAME, LOB] | [IncurredLosses, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded, EarnedPremNet] | Annual (19 Yrs) | Annual (19 Yrs) |
+| friedland_auto_bi_insurer | (none) | [Paid Claims, Reported Claims] | Annual (9 Yrs) | Annual (9 Yrs) |
+| friedland_auto_freq_sev | (none) | [Closed Claim Counts, Reported Claim Counts, Reported Claims, Reported Severity] | Semiannual (10 Half-Yrs) | Semiannual (10 Half-Yrs) |
+| friedland_auto_salsub | (none) | [Reported Salvage and Subrogation, Received Salvage and Subrogation, Reported Claims, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) |
+| friedland_autoprop | (none) | [Reported ALAE, Paid ALAE, Reported Claims, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) |
+| friedland_berq_sher_auto | (none) | [Paid Claims, Closed Claim Counts, Reported Claim Counts, Disposal Rate] | Annual (8 Yrs) | Annual (8 Yrs) |
+| friedland_gl_insurer | (none) | [Closed Claim Counts, Reported Claim Counts, Disposal Rate, Paid Claims] | Annual (8 Yrs) | Annual (8 Yrs) |
+| friedland_med_mal | (none) | [Reported Claims, Paid Claims, Case Outstanding, Open Claim Counts] | Annual (8 Yrs) | Annual (8 Yrs) |
+| friedland_qs | (none) | [Gross Reported Claims, Net Reported Claims, Net to Gross] | Annual (4 Yrs) | Annual (4 Yrs) |
+| friedland_us_auto_chg_prod_mix | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) |
+| friedland_us_auto_incr_claim | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) |
+| friedland_us_auto_steady_state | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) |
+| friedland_us_industry_auto | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) |
+| friedland_us_industry_auto_case | (none) | [Case Outstanding, Paid Claims] | Annual (10 Yrs) | Annual (10 Yrs) |
+| friedland_uspp_auto_increasing_case | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) |
+| friedland_uspp_auto_increasing_claim | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) |
+| friedland_uspp_auto_steady_state | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) |
+| friedland_uspp_increasing_claim_case | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) |
+| friedland_wc_self_insurer | (none) | [Closed Claim Counts, Reported Claim Counts, Paid Claims, Paid Severities, Reported Claims, Reported Severities] | Annual (8 Yrs) | Annual (8 Yrs) |
+| friedland_xol | (none) | [Gross Reported Claims, Net Reported Claims, Ceded Reported Claims] | Annual (4 Yrs) | Annual (4 Yrs) |
+| friedland_xyz_auto_bi | (none) | [Paid Claims, Reported Claims] | Annual (11 Yrs) | Annual (11 Yrs) |
+| friedland_xyz_case | (none) | [Case Outstanding, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) |
+| friedland_xyz_disp | (none) | [Disposal Rate, Closed Claim Counts, Paid Claims] | Annual (8 Yrs) | Annual (8 Yrs) |
+| friedland_xyz_freq_sev | (none) | [Closed Claim Counts, Reported Claim Counts, Reported Claims, Reported Severities] | Annual (11 Yrs) | Annual (11 Yrs) |
+| genins | (none) | [values] | Annual (10 Yrs) | Annual (10 Yrs) |
+| ia_sample | (none) | [loss, exposure] | Annual (6 Yrs) | Annual (6 Yrs) |
+| liab | [lob] | [values] | Annual (14 Yrs) | Annual (14 Yrs) |
+| m3ir5 | (none) | [values] | Annual (14 Yrs) | Annual (14 Yrs) |
+| mack_1997 | (none) | [Case Incurred] | Annual (10 Yrs) | Annual (10 Yrs) |
+| mcl | (none) | [incurred, paid] | Annual (7 Yrs) | Annual (7 Yrs) |
+| mortgage | (none) | [values] | Annual (9 Yrs) | Annual (9 Yrs) |
+| mw2008 | (none) | [values] | Annual (9 Yrs) | Annual (9 Yrs) |
+| mw2014 | (none) | [values] | Annual (17 Yrs) | Annual (17 Yrs) |
+| prism | [ClaimNo, Line, Type, ClaimLiability, Limit, Deductible] | [reportedCount, closedPaidCount, Paid, Incurred] | Month (120 months) | Month (120 months) |
+| quarterly | (none) | [incurred, paid] | Annual (12 Yrs) | Quarter (45 Qtrs) |
+| raa | (none) | [values] | Annual (10 Yrs) | Annual (10 Yrs) |
+| tail_sample | (none) | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) |
+| ukmotor | (none) | [values] | Annual (7 Yrs) | Annual (7 Yrs) |
+| usaa | (none) | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) |
+| usauto | (none) | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) |
+| xyz | (none) | [Incurred, Paid, Reported, Closed, Premium] | Annual (11 Yrs) | Annual (11 Yrs) |
diff --git a/scripts/regen_sample_data_docs.py b/scripts/regen_sample_data_docs.py
new file mode 100644
index 00000000..5c250c0a
--- /dev/null
+++ b/scripts/regen_sample_data_docs.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+"""Regenerate docs/library/sample_data.md from the sample-dataset manifest.
+
+The sample-data documentation table used to be maintained by hand, which let it
+drift out of sync with the actual datasets (missing rows, typo'd names, wrong
+grain). It is now generated from the single source of truth,
+``chainladder/utils/data/_manifest.py``, by way of :func:`chainladder.list_samples`.
+
+Run from the repository root after adding or changing a sample dataset::
+
+    python scripts/regen_sample_data_docs.py
+
+The script overwrites ``docs/library/sample_data.md`` in place.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+import chainladder as cl
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+DOCS_PATH = REPO_ROOT / "docs" / "library" / "sample_data.md"
+
+HEADER = """# Sample Dataset
+
+Below is the list of all datasets that come included with the `chainladder` package, and their basic attributes.
+
+You can load any dataset with `cl.load_sample(...)` such as `cl.load_sample("abc")`.
+
+This table is generated from the sample-dataset manifest
+(`chainladder/utils/data/_manifest.py`) via `cl.list_samples()`. To regenerate it,
+run `python scripts/regen_sample_data_docs.py` from the repository root.
+
+"""
+
+
+def _fmt_list(value) -> str:
+    """Render an index/columns cell."""
+    if value is None:
+        return "(none)"
+    return "[" + ", ".join(str(v) for v in value) + "]"
+
+
+# Unit noun used in the "(N units)" suffix on each grain label.
+_GRAIN_UNITS: dict = {
+    "Annual": "Yrs",
+    "Semiannual": "Half-Yrs",
+    "Quarter": "Qtrs",
+    "Month": "months",
+}
+
+
+def _fmt_grain(label: str, periods: int) -> str:
+    """Render a grain cell, e.g. 'Annual (10 Yrs)'."""
+    unit = _GRAIN_UNITS.get(label, label)
+    return f"{label} ({periods} {unit})"
+
+
+def build_table() -> str:
+    df = cl.list_samples()
+
+    rows = [
+        "| Dataset Name | Indexes | Columns | Origin Grain | Development Grain |",
+        "|---|---|---|---|---|",
+    ]
+    for name, row in df.iterrows():
+        rows.append(
+            "| {name} | {index} | {columns} | {origin} | {development} |".format(
+                name=name,
+                index=_fmt_list(row["index"]),
+                columns=_fmt_list(row["columns"]),
+                origin=_fmt_grain(row["origin_grain"], row["origin_periods"]),
+                development=_fmt_grain(
+                    row["development_grain"], row["development_periods"]
+                ),
+            )
+        )
+    return "\n".join(rows) + "\n"
+
+
+def main() -> None:
+    content = HEADER + build_table()
+    DOCS_PATH.write_text(content)
+    print(f"Wrote {DOCS_PATH.relative_to(REPO_ROOT)}")
+
+
+if __name__ == "__main__":
+    main()

From 577fc842b3d54325f2b5f8e50746ee391775db63 Mon Sep 17 00:00:00 2001
From: Nick Kinney <Nick.Kinney4@gmail.com>
Date: Sat, 30 May 2026 23:00:50 -0400
Subject: [PATCH 2/3] docs+test: wire list_samples into API ref, prove sdist
 completeness (#774)

Addresses review feedback on #886 from @henrydingliu:

- Add list_samples to docs/library/api.md autosummary and a matching
  generated stub so the new utility appears in the API reference. The
  reference is hand-maintained, not auto-discovered, so this is a manual
  add. Also dropped a pre-existing duplicate load_sample entry in the
  same autosummary block.
- Add test_sdist_ships_all_samples: builds a source distribution and
  asserts every sample CSV is present, guarding against MANIFEST.in
  drifting out of sync again. Self-skips when the build package or a
  source checkout is unavailable, so it stays out of the fast suite as
  Henry suggested.
---
 chainladder/utils/tests/test_utilities.py     | 47 +++++++++++++++++++
 docs/library/api.md                           |  2 +-
 .../generated/chainladder.list_samples.rst    |  6 +++
 3 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100644 docs/library/generated/chainladder.list_samples.rst

diff --git a/chainladder/utils/tests/test_utilities.py b/chainladder/utils/tests/test_utilities.py
index bc68abfc..f3608cc1 100644
--- a/chainladder/utils/tests/test_utilities.py
+++ b/chainladder/utils/tests/test_utilities.py
@@ -178,6 +178,53 @@ def test_list_samples() -> None:
     assert df.loc["prism", "cumulative"] == SAMPLES["prism"]["cumulative"]
 
 
+def test_sdist_ships_all_samples(tmp_path) -> None:
+    """
+    Build a source distribution and assert it contains every sample CSV.
+
+    This is the guard against MANIFEST.in drifting out of sync with the data
+    folder again (the bug behind #774: the old per-file include list shipped
+    only 22 of the bundled CSVs). It is deliberately self-skipping rather than
+    a hard requirement of the fast suite: it needs the ``build`` package and a
+    source checkout (a pyproject.toml at the repo root), and it shells out to a
+    full sdist build, so it is skipped in environments that lack either.
+    """
+    import subprocess
+    import sys
+    import tarfile
+
+    pytest.importorskip("build", reason="requires the build package")
+
+    # Locate the repo root (the directory containing pyproject.toml). When
+    # running from an installed wheel there is no source tree, so skip.
+    repo_root: Path = Path(__file__).resolve().parents[3]
+    if not (repo_root / "pyproject.toml").is_file():
+        pytest.skip("not running from a source checkout")
+
+    data_dir: Path = Path(__file__).parent.parent / "data"
+    expected_csvs = {f.name for f in data_dir.glob("*.csv")}
+
+    subprocess.run(
+        [sys.executable, "-m", "build", "--sdist", "--outdir", str(tmp_path)],
+        cwd=repo_root,
+        check=True,
+        capture_output=True,
+    )
+
+    sdists = list(tmp_path.glob("*.tar.gz"))
+    assert len(sdists) == 1, f"expected one sdist, found {sdists}"
+
+    with tarfile.open(sdists[0]) as tar:
+        shipped = {
+            Path(name).name
+            for name in tar.getnames()
+            if "/utils/data/" in name and name.endswith(".csv")
+        }
+
+    missing = expected_csvs - shipped
+    assert not missing, f"sdist is missing sample CSVs: {sorted(missing)}"
+
+
 def test_load_sample_clrd2025() -> None:
     """
     Tests the clrd2025 sample (CAS Schedule P 1998-2007 refresh).
diff --git a/docs/library/api.md b/docs/library/api.md
index d0985fab..7565e421 100644
--- a/docs/library/api.md
+++ b/docs/library/api.md
@@ -165,10 +165,10 @@ Functions
    :template: function.rst
 
    load_sample
+   list_samples
    read_pickle
    read_json
    concat
-   load_sample
    minimum
    maximum
 
diff --git a/docs/library/generated/chainladder.list_samples.rst b/docs/library/generated/chainladder.list_samples.rst
new file mode 100644
index 00000000..2f430531
--- /dev/null
+++ b/docs/library/generated/chainladder.list_samples.rst
@@ -0,0 +1,6 @@
+﻿chainladder.list\_samples
+=========================
+
+.. currentmodule:: chainladder
+
+.. autofunction:: list_samples
\ No newline at end of file

From 724b45518cfebdbbaf51e2e6ee31fd08b3ac7e03 Mon Sep 17 00:00:00 2001
From: Nick Kinney <Nick.Kinney4@gmail.com>
Date: Sun, 31 May 2026 11:12:44 -0400
Subject: [PATCH 3/3] docs: convert sample_data to a notebook that calls
 list_samples (#774)

Per Henry's review on #886: replace the static sample_data.md table with
a notebook (sample_data.ipynb) that renders the table live via
cl.list_samples(), so the docs can never drift from the manifest. This
also retires the now-pointless scripts/regen_sample_data_docs.py and the
committed list_samples.rst autosummary stub (autosummary regenerates it
at build, and the experimental branch is removing these stubs anyway;
the api.md autosummary entry for list_samples stays).

The notebook keeps the '# Sample Dataset' H1 so the existing
sample_data.html#sample-dataset anchor used by the tutorials still
resolves. _toc.yml, the manifest module docstring, and the
utility_functions comment are updated to point at the notebook.
---
 chainladder/utils/data/_manifest.py           |  3 +-
 chainladder/utils/utility_functions.py        |  2 +-
 docs/_toc.yml                                 |  2 +-
 .../generated/chainladder.list_samples.rst    |  6 --
 docs/library/sample_data.ipynb                | 52 +++++++++++
 docs/library/sample_data.md                   | 58 ------------
 scripts/regen_sample_data_docs.py             | 88 -------------------
 7 files changed, 56 insertions(+), 155 deletions(-)
 delete mode 100644 docs/library/generated/chainladder.list_samples.rst
 create mode 100644 docs/library/sample_data.ipynb
 delete mode 100644 docs/library/sample_data.md
 delete mode 100644 scripts/regen_sample_data_docs.py

diff --git a/chainladder/utils/data/_manifest.py b/chainladder/utils/data/_manifest.py
index ff91a191..c0f65139 100644
--- a/chainladder/utils/data/_manifest.py
+++ b/chainladder/utils/data/_manifest.py
@@ -5,7 +5,8 @@
 
 * :func:`chainladder.load_sample` -- to build the ``Triangle`` for a sample.
 * :func:`chainladder.list_samples` -- to list available samples.
-* ``docs/library/sample_data.md`` -- regenerated from this registry.
+* ``docs/library/sample_data.ipynb`` -- renders the sample table live via
+  ``cl.list_samples()``.
 * ``MANIFEST.in`` -- ships ``chainladder/utils/data/*.csv`` via a wildcard.
 
 Adding a new sample dataset is a one-entry change here (plus dropping the
diff --git a/chainladder/utils/utility_functions.py b/chainladder/utils/utility_functions.py
index cd0b892e..7e8ac606 100644
--- a/chainladder/utils/utility_functions.py
+++ b/chainladder/utils/utility_functions.py
@@ -109,7 +109,7 @@ def load_sample(key: str, *args, **kwargs) -> Triangle:
     # manifest (chainladder/utils/data/_manifest.py). The manifest is the
     # single source of truth for sample-dataset metadata, replacing the long
     # per-dataset if/elif chain that previously lived here and duplicated the
-    # column names already present in the tests and docs/library/sample_data.md.
+    # column names already present in the tests and the sample-data docs.
     config: dict = SAMPLES[key.lower()]
     origin = config["origin"]
     development = config["development"]
diff --git a/docs/_toc.yml b/docs/_toc.yml
index 6229fceb..b7c6a219 100644
--- a/docs/_toc.yml
+++ b/docs/_toc.yml
@@ -44,7 +44,7 @@ parts:
       sections:
       - file: library/usage.md
       - file: library/references.md
-      - file: library/sample_data.md
+      - file: library/sample_data.ipynb
       - file: library/glossary.md
       - file: library/questions_issues.md
       - file: library/contributing.md
diff --git a/docs/library/generated/chainladder.list_samples.rst b/docs/library/generated/chainladder.list_samples.rst
deleted file mode 100644
index 2f430531..00000000
--- a/docs/library/generated/chainladder.list_samples.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-﻿chainladder.list\_samples
-=========================
-
-.. currentmodule:: chainladder
-
-.. autofunction:: list_samples
\ No newline at end of file
diff --git a/docs/library/sample_data.ipynb b/docs/library/sample_data.ipynb
new file mode 100644
index 00000000..d983c656
--- /dev/null
+++ b/docs/library/sample_data.ipynb
@@ -0,0 +1,52 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Sample Dataset\n",
+    "\n",
+    "The `chainladder` package ships with a number of sample datasets that are\n",
+    "used throughout the documentation and examples. Load any of them with\n",
+    "`cl.load_sample(...)`, for example `cl.load_sample(\"abc\")`.\n",
+    "\n",
+    "The table below lists every available dataset alongside its basic\n",
+    "attributes. It is produced live by `cl.list_samples()`, which reads from the\n",
+    "sample-dataset manifest (`chainladder/utils/data/_manifest.py`), so it never\n",
+    "drifts out of sync with the datasets that actually ship."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import chainladder as cl\n",
+    "\n",
+    "cl.list_samples()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/library/sample_data.md b/docs/library/sample_data.md
deleted file mode 100644
index 1b8882f7..00000000
--- a/docs/library/sample_data.md
+++ /dev/null
@@ -1,58 +0,0 @@
-# Sample Dataset
-
-Below is the list of all datasets that come included with the `chainladder` package, and their basic attributes.
-
-You can load any dataset with `cl.load_sample(...)` such as `cl.load_sample("abc")`.
-
-This table is generated from the sample-dataset manifest
-(`chainladder/utils/data/_manifest.py`) via `cl.list_samples()`. To regenerate it,
-run `python scripts/regen_sample_data_docs.py` from the repository root.
-
-| Dataset Name | Indexes | Columns | Origin Grain | Development Grain |
-|---|---|---|---|---|
-| abc | (none) | [values] | Annual (11 Yrs) | Annual (11 Yrs) |
-| auto | [lob] | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) |
-| berqsherm | [LOB] | [Incurred, Paid, Reported, Closed] | Annual (8 Yrs) | Annual (8 Yrs) |
-| cc_sample | (none) | [loss, exposure] | Annual (5 Yrs) | Annual (5 Yrs) |
-| clrd | [GRNAME, LOB] | [IncurLoss, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded, EarnedPremNet] | Annual (10 Yrs) | Annual (10 Yrs) |
-| clrd2025 | [GRNAME, LOB] | [IncurredLosses, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded, EarnedPremNet] | Annual (19 Yrs) | Annual (19 Yrs) |
-| friedland_auto_bi_insurer | (none) | [Paid Claims, Reported Claims] | Annual (9 Yrs) | Annual (9 Yrs) |
-| friedland_auto_freq_sev | (none) | [Closed Claim Counts, Reported Claim Counts, Reported Claims, Reported Severity] | Semiannual (10 Half-Yrs) | Semiannual (10 Half-Yrs) |
-| friedland_auto_salsub | (none) | [Reported Salvage and Subrogation, Received Salvage and Subrogation, Reported Claims, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) |
-| friedland_autoprop | (none) | [Reported ALAE, Paid ALAE, Reported Claims, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) |
-| friedland_berq_sher_auto | (none) | [Paid Claims, Closed Claim Counts, Reported Claim Counts, Disposal Rate] | Annual (8 Yrs) | Annual (8 Yrs) |
-| friedland_gl_insurer | (none) | [Closed Claim Counts, Reported Claim Counts, Disposal Rate, Paid Claims] | Annual (8 Yrs) | Annual (8 Yrs) |
-| friedland_med_mal | (none) | [Reported Claims, Paid Claims, Case Outstanding, Open Claim Counts] | Annual (8 Yrs) | Annual (8 Yrs) |
-| friedland_qs | (none) | [Gross Reported Claims, Net Reported Claims, Net to Gross] | Annual (4 Yrs) | Annual (4 Yrs) |
-| friedland_us_auto_chg_prod_mix | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) |
-| friedland_us_auto_incr_claim | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) |
-| friedland_us_auto_steady_state | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) |
-| friedland_us_industry_auto | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) |
-| friedland_us_industry_auto_case | (none) | [Case Outstanding, Paid Claims] | Annual (10 Yrs) | Annual (10 Yrs) |
-| friedland_uspp_auto_increasing_case | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) |
-| friedland_uspp_auto_increasing_claim | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) |
-| friedland_uspp_auto_steady_state | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) |
-| friedland_uspp_increasing_claim_case | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) |
-| friedland_wc_self_insurer | (none) | [Closed Claim Counts, Reported Claim Counts, Paid Claims, Paid Severities, Reported Claims, Reported Severities] | Annual (8 Yrs) | Annual (8 Yrs) |
-| friedland_xol | (none) | [Gross Reported Claims, Net Reported Claims, Ceded Reported Claims] | Annual (4 Yrs) | Annual (4 Yrs) |
-| friedland_xyz_auto_bi | (none) | [Paid Claims, Reported Claims] | Annual (11 Yrs) | Annual (11 Yrs) |
-| friedland_xyz_case | (none) | [Case Outstanding, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) |
-| friedland_xyz_disp | (none) | [Disposal Rate, Closed Claim Counts, Paid Claims] | Annual (8 Yrs) | Annual (8 Yrs) |
-| friedland_xyz_freq_sev | (none) | [Closed Claim Counts, Reported Claim Counts, Reported Claims, Reported Severities] | Annual (11 Yrs) | Annual (11 Yrs) |
-| genins | (none) | [values] | Annual (10 Yrs) | Annual (10 Yrs) |
-| ia_sample | (none) | [loss, exposure] | Annual (6 Yrs) | Annual (6 Yrs) |
-| liab | [lob] | [values] | Annual (14 Yrs) | Annual (14 Yrs) |
-| m3ir5 | (none) | [values] | Annual (14 Yrs) | Annual (14 Yrs) |
-| mack_1997 | (none) | [Case Incurred] | Annual (10 Yrs) | Annual (10 Yrs) |
-| mcl | (none) | [incurred, paid] | Annual (7 Yrs) | Annual (7 Yrs) |
-| mortgage | (none) | [values] | Annual (9 Yrs) | Annual (9 Yrs) |
-| mw2008 | (none) | [values] | Annual (9 Yrs) | Annual (9 Yrs) |
-| mw2014 | (none) | [values] | Annual (17 Yrs) | Annual (17 Yrs) |
-| prism | [ClaimNo, Line, Type, ClaimLiability, Limit, Deductible] | [reportedCount, closedPaidCount, Paid, Incurred] | Month (120 months) | Month (120 months) |
-| quarterly | (none) | [incurred, paid] | Annual (12 Yrs) | Quarter (45 Qtrs) |
-| raa | (none) | [values] | Annual (10 Yrs) | Annual (10 Yrs) |
-| tail_sample | (none) | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) |
-| ukmotor | (none) | [values] | Annual (7 Yrs) | Annual (7 Yrs) |
-| usaa | (none) | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) |
-| usauto | (none) | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) |
-| xyz | (none) | [Incurred, Paid, Reported, Closed, Premium] | Annual (11 Yrs) | Annual (11 Yrs) |
diff --git a/scripts/regen_sample_data_docs.py b/scripts/regen_sample_data_docs.py
deleted file mode 100644
index 5c250c0a..00000000
--- a/scripts/regen_sample_data_docs.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env python
-"""Regenerate docs/library/sample_data.md from the sample-dataset manifest.
-
-The sample-data documentation table used to be maintained by hand, which let it
-drift out of sync with the actual datasets (missing rows, typo'd names, wrong
-grain). It is now generated from the single source of truth,
-``chainladder/utils/data/_manifest.py``, by way of :func:`chainladder.list_samples`.
-
-Run from the repository root after adding or changing a sample dataset::
-
-    python scripts/regen_sample_data_docs.py
-
-The script overwrites ``docs/library/sample_data.md`` in place.
-"""
-from __future__ import annotations
-
-from pathlib import Path
-
-import chainladder as cl
-
-REPO_ROOT = Path(__file__).resolve().parent.parent
-DOCS_PATH = REPO_ROOT / "docs" / "library" / "sample_data.md"
-
-HEADER = """# Sample Dataset
-
-Below is the list of all datasets that come included with the `chainladder` package, and their basic attributes.
-
-You can load any dataset with `cl.load_sample(...)` such as `cl.load_sample("abc")`.
-
-This table is generated from the sample-dataset manifest
-(`chainladder/utils/data/_manifest.py`) via `cl.list_samples()`. To regenerate it,
-run `python scripts/regen_sample_data_docs.py` from the repository root.
-
-"""
-
-
-def _fmt_list(value) -> str:
-    """Render an index/columns cell."""
-    if value is None:
-        return "(none)"
-    return "[" + ", ".join(str(v) for v in value) + "]"
-
-
-# Unit noun used in the "(N units)" suffix on each grain label.
-_GRAIN_UNITS: dict = {
-    "Annual": "Yrs",
-    "Semiannual": "Half-Yrs",
-    "Quarter": "Qtrs",
-    "Month": "months",
-}
-
-
-def _fmt_grain(label: str, periods: int) -> str:
-    """Render a grain cell, e.g. 'Annual (10 Yrs)'."""
-    unit = _GRAIN_UNITS.get(label, label)
-    return f"{label} ({periods} {unit})"
-
-
-def build_table() -> str:
-    df = cl.list_samples()
-
-    rows = [
-        "| Dataset Name | Indexes | Columns | Origin Grain | Development Grain |",
-        "|---|---|---|---|---|",
-    ]
-    for name, row in df.iterrows():
-        rows.append(
-            "| {name} | {index} | {columns} | {origin} | {development} |".format(
-                name=name,
-                index=_fmt_list(row["index"]),
-                columns=_fmt_list(row["columns"]),
-                origin=_fmt_grain(row["origin_grain"], row["origin_periods"]),
-                development=_fmt_grain(
-                    row["development_grain"], row["development_periods"]
-                ),
-            )
-        )
-    return "\n".join(rows) + "\n"
-
-
-def main() -> None:
-    content = HEADER + build_table()
-    DOCS_PATH.write_text(content)
-    print(f"Wrote {DOCS_PATH.relative_to(REPO_ROOT)}")
-
-
-if __name__ == "__main__":
-    main()