From a608eb4da8995715285c889bc90d0f9d8dd6b823 Mon Sep 17 00:00:00 2001 From: Nick Kinney Date: Sat, 30 May 2026 20:03:29 -0400 Subject: [PATCH 1/3] feat(data): centralize sample-dataset metadata in a manifest (#774) Sample-dataset metadata was duplicated across four places: the long chain of if statements in load_sample, the tests, docs/library/sample_data.md, and the per-file include list in MANIFEST.in. Adding a sample meant editing all four by hand, and they had already drifted (sample_data.md listed 23 of 46 datasets with several misspelled names; MANIFEST.in listed 22, so the sdist shipped incomplete). Introduce chainladder/utils/data/_manifest.py as the single source of truth (a plain Python dict, no new dependency), and key everything off it: - load_sample now looks up its Triangle config from the manifest instead of the if chain. Verified behavior-preserving: the resolved origin/development/index/columns/cumulative for all 46 bundled samples is byte-identical before and after. - New public cl.list_samples() returns a DataFrame of name, index, columns, cumulative, and (optionally) grain and period counts. Doubles as the source for the docs table. - test_load_sample iterates the manifest rather than globbing the data directory, so adding a sample is a one-entry change and stray non-CSV files cannot be mistaken for datasets. Added a two-way sync assertion (manifest == CSVs on disk) and a test_list_samples test. - MANIFEST.in collapses the 22 hand-listed includes to one recursive-include chainladder/utils/data *.csv. Verified the built sdist now contains all 46 CSVs (was 22). - docs/library/sample_data.md is regenerated from cl.list_samples() via scripts/regen_sample_data_docs.py; the table is now complete and accurate. Closes #774. 
--- MANIFEST.in | 23 +- chainladder/utils/__init__.py | 1 + chainladder/utils/data/_manifest.py | 355 ++++++++++++++++++++++ chainladder/utils/tests/test_utilities.py | 55 +++- chainladder/utils/utility_functions.py | 246 ++++++--------- docs/library/sample_data.md | 101 +++--- scripts/regen_sample_data_docs.py | 88 ++++++ 7 files changed, 623 insertions(+), 246 deletions(-) create mode 100644 chainladder/utils/data/_manifest.py create mode 100644 scripts/regen_sample_data_docs.py diff --git a/MANIFEST.in b/MANIFEST.in index 0eed2f16..b0773a5f 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,25 +2,4 @@ include LICENSE include requirements.txt include README.rst -include chainladder/utils/data/abc.csv -include chainladder/utils/data/auto.csv -include chainladder/utils/data/clrd.csv -include chainladder/utils/data/clrd2025.csv -include chainladder/utils/data/genins.csv -include chainladder/utils/data/liab.csv -include chainladder/utils/data/m3ir5.csv -include chainladder/utils/data/mcl.csv -include chainladder/utils/data/mortgage.csv -include chainladder/utils/data/mw2008.csv -include chainladder/utils/data/mw2014.csv -include chainladder/utils/data/quarterly.csv -include chainladder/utils/data/raa.csv -include chainladder/utils/data/ukmotor.csv -include chainladder/utils/data/usaa.csv -include chainladder/utils/data/usauto.csv -include chainladder/utils/data/cc_sample.csv -include chainladder/utils/data/ia_sample.csv -include chainladder/utils/data/prism.csv -include chainladder/utils/data/tail_sample.csv -include chainladder/utils/data/berqsherm.csv -include chainladder/utils/data/xyz.csv \ No newline at end of file +recursive-include chainladder/utils/data *.csv diff --git a/chainladder/utils/__init__.py b/chainladder/utils/__init__.py index 1800834f..3f665072 100644 --- a/chainladder/utils/__init__.py +++ b/chainladder/utils/__init__.py @@ -12,6 +12,7 @@ read_json, concat, load_sample, + list_samples, minimum, maximum, PatsyFormula, diff --git 
a/chainladder/utils/data/_manifest.py b/chainladder/utils/data/_manifest.py new file mode 100644 index 00000000..ff91a191 --- /dev/null +++ b/chainladder/utils/data/_manifest.py @@ -0,0 +1,355 @@ +"""Central registry of bundled sample datasets. + +Single source of truth for the metadata of every CSV in +``chainladder/utils/data/``. Consumed by: + +* :func:`chainladder.load_sample` -- to build the ``Triangle`` for a sample. +* :func:`chainladder.list_samples` -- to list available samples. +* ``docs/library/sample_data.md`` -- regenerated from this registry. +* ``MANIFEST.in`` -- ships ``chainladder/utils/data/*.csv`` via a wildcard. + +Adding a new sample dataset is a one-entry change here (plus dropping the +CSV in ``chainladder/utils/data/``); ``load_sample``, the docs table, and the +tests all key off this dict, so the metadata no longer has to be repeated in +three places. + +Each entry maps the sample name (the CSV filename without extension, lower +case) to the keyword arguments passed to ``Triangle``: + +``origin`` + Column name(s) for the origin period. +``development`` + Column name(s) for the development period. +``index`` + Column name(s) used as the Triangle index, or ``None``. +``columns`` + Measure column name(s) loaded into the Triangle. +``cumulative`` + ``True`` if the measures are cumulative, ``False`` if incremental. 
+""" + +SAMPLES: dict = { + + "abc": { + "origin": 'origin', + "development": 'development', + "index": None, + "columns": ['values'], + "cumulative": True, + }, + "auto": { + "origin": 'origin', + "development": 'development', + "index": ['lob'], + "columns": ['incurred', 'paid'], + "cumulative": True, + }, + "berqsherm": { + "origin": 'AccidentYear', + "development": 'DevelopmentYear', + "index": ['LOB'], + "columns": ['Incurred', 'Paid', 'Reported', 'Closed'], + "cumulative": True, + }, + "cc_sample": { + "origin": 'origin', + "development": 'development', + "index": None, + "columns": ['loss', 'exposure'], + "cumulative": True, + }, + "clrd": { + "origin": 'AccidentYear', + "development": 'DevelopmentYear', + "index": ['GRNAME', 'LOB'], + "columns": ['IncurLoss', 'CumPaidLoss', 'BulkLoss', 'EarnedPremDIR', 'EarnedPremCeded', 'EarnedPremNet'], + "cumulative": True, + }, + "clrd2025": { + "origin": 'AccidentYear', + "development": 'DevelopmentYear', + "index": ['GRNAME', 'LOB'], + "columns": ['IncurredLosses', 'CumPaidLoss', 'BulkLoss', 'EarnedPremDIR', 'EarnedPremCeded', 'EarnedPremNet'], + "cumulative": True, + }, + "friedland_auto_bi_insurer": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Paid Claims', 'Reported Claims'], + "cumulative": True, + }, + "friedland_auto_freq_sev": { + "origin": 'Accident Half-Year', + "development": 'Calendar Half-Year', + "index": None, + "columns": ['Closed Claim Counts', 'Reported Claim Counts', 'Reported Claims', 'Reported Severity'], + "cumulative": True, + }, + "friedland_auto_salsub": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Reported Salvage and Subrogation', 'Received Salvage and Subrogation', 'Reported Claims', 'Paid Claims'], + "cumulative": True, + }, + "friedland_autoprop": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Reported ALAE', 'Paid ALAE', 'Reported 
Claims', 'Paid Claims'], + "cumulative": True, + }, + "friedland_berq_sher_auto": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Paid Claims', 'Closed Claim Counts', 'Reported Claim Counts', 'Disposal Rate'], + "cumulative": True, + }, + "friedland_gl_insurer": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Closed Claim Counts', 'Reported Claim Counts', 'Disposal Rate', 'Paid Claims'], + "cumulative": True, + }, + "friedland_med_mal": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Reported Claims', 'Paid Claims', 'Case Outstanding', 'Open Claim Counts'], + "cumulative": True, + }, + "friedland_qs": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Gross Reported Claims', 'Net Reported Claims', 'Net to Gross'], + "cumulative": True, + }, + "friedland_us_auto_chg_prod_mix": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Paid Claims', 'Reported Claims'], + "cumulative": True, + }, + "friedland_us_auto_incr_claim": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Paid Claims', 'Reported Claims'], + "cumulative": True, + }, + "friedland_us_auto_steady_state": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Paid Claims', 'Reported Claims'], + "cumulative": True, + }, + "friedland_us_industry_auto": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Paid Claims', 'Reported Claims'], + "cumulative": True, + }, + "friedland_us_industry_auto_case": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Case Outstanding', 'Paid Claims'], + "cumulative": True, + }, + "friedland_uspp_auto_increasing_case": { + "origin": 'Accident Year', + "development": 
'Calendar Year', + "index": None, + "columns": ['Reported Claims', 'Paid Claims', 'Earned Premium'], + "cumulative": True, + }, + "friedland_uspp_auto_increasing_claim": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Reported Claims', 'Paid Claims', 'Earned Premium'], + "cumulative": True, + }, + "friedland_uspp_auto_steady_state": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Reported Claims', 'Paid Claims', 'Earned Premium'], + "cumulative": True, + }, + "friedland_uspp_increasing_claim_case": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Reported Claims', 'Paid Claims', 'Earned Premium'], + "cumulative": True, + }, + "friedland_wc_self_insurer": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Closed Claim Counts', 'Reported Claim Counts', 'Paid Claims', 'Paid Severities', 'Reported Claims', 'Reported Severities'], + "cumulative": True, + }, + "friedland_xol": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Gross Reported Claims', 'Net Reported Claims', 'Ceded Reported Claims'], + "cumulative": True, + }, + "friedland_xyz_auto_bi": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Paid Claims', 'Reported Claims'], + "cumulative": True, + }, + "friedland_xyz_case": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Case Outstanding', 'Paid Claims'], + "cumulative": True, + }, + "friedland_xyz_disp": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Disposal Rate', 'Closed Claim Counts', 'Paid Claims'], + "cumulative": True, + }, + "friedland_xyz_freq_sev": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Closed Claim Counts', 
'Reported Claim Counts', 'Reported Claims', 'Reported Severities'], + "cumulative": True, + }, + "genins": { + "origin": 'origin', + "development": 'development', + "index": None, + "columns": ['values'], + "cumulative": True, + }, + "ia_sample": { + "origin": 'origin', + "development": 'development', + "index": None, + "columns": ['loss', 'exposure'], + "cumulative": True, + }, + "liab": { + "origin": 'origin', + "development": 'development', + "index": ['lob'], + "columns": ['values'], + "cumulative": True, + }, + "m3ir5": { + "origin": 'origin', + "development": 'development', + "index": None, + "columns": ['values'], + "cumulative": True, + }, + "mack_1997": { + "origin": 'Accident Year', + "development": 'Calendar Year', + "index": None, + "columns": ['Case Incurred'], + "cumulative": True, + }, + "mcl": { + "origin": 'origin', + "development": 'development', + "index": None, + "columns": ['incurred', 'paid'], + "cumulative": True, + }, + "mortgage": { + "origin": 'origin', + "development": 'development', + "index": None, + "columns": ['values'], + "cumulative": True, + }, + "mw2008": { + "origin": 'origin', + "development": 'development', + "index": None, + "columns": ['values'], + "cumulative": True, + }, + "mw2014": { + "origin": 'origin', + "development": 'development', + "index": None, + "columns": ['values'], + "cumulative": True, + }, + "prism": { + "origin": 'AccidentDate', + "development": 'PaymentDate', + "index": ['ClaimNo', 'Line', 'Type', 'ClaimLiability', 'Limit', 'Deductible'], + "columns": ['reportedCount', 'closedPaidCount', 'Paid', 'Incurred'], + "cumulative": False, + }, + "quarterly": { + "origin": 'origin', + "development": 'development', + "index": None, + "columns": ['incurred', 'paid'], + "cumulative": True, + }, + "raa": { + "origin": 'origin', + "development": 'development', + "index": None, + "columns": ['values'], + "cumulative": True, + }, + "tail_sample": { + "origin": 'origin', + "development": 'development', + "index": None, + 
"columns": ['incurred', 'paid'], + "cumulative": True, + }, + "ukmotor": { + "origin": 'origin', + "development": 'development', + "index": None, + "columns": ['values'], + "cumulative": True, + }, + "usaa": { + "origin": 'origin', + "development": 'development', + "index": None, + "columns": ['incurred', 'paid'], + "cumulative": True, + }, + "usauto": { + "origin": 'origin', + "development": 'development', + "index": None, + "columns": ['incurred', 'paid'], + "cumulative": True, + }, + "xyz": { + "origin": 'AccidentYear', + "development": 'DevelopmentYear', + "index": None, + "columns": ['Incurred', 'Paid', 'Reported', 'Closed', 'Premium'], + "cumulative": True, + }, +} diff --git a/chainladder/utils/tests/test_utilities.py b/chainladder/utils/tests/test_utilities.py index 0aeffd27..bc68abfc 100644 --- a/chainladder/utils/tests/test_utilities.py +++ b/chainladder/utils/tests/test_utilities.py @@ -8,6 +8,7 @@ __dt64_unit__ ) from chainladder.utils.utility_functions import date_delta_adjustment +from chainladder.utils.data._manifest import SAMPLES from pathlib import Path @@ -130,23 +131,51 @@ def test_invalid_sample() -> None: def test_load_sample() -> None: """ - Tests whether the supported sample data sets load. - """ + Tests whether every sample data set declared in the manifest loads. - # Get the folder containing the datasets. - data_dir: Path = Path(__file__).parent.parent / 'data' + Iterating over the manifest (rather than globbing the data directory) + means adding a new sample is a one-entry change in + ``chainladder/utils/data/_manifest.py`` and this test picks it up + automatically, while non-data files in the folder (``__init__.py``, + ``_manifest.py``) are never mistaken for datasets. + """ + # Every manifest entry must load and have a matching CSV on disk. 
+ data_dir: Path = Path(__file__).parent.parent / "data" + for dataset in SAMPLES: + assert (data_dir / f"{dataset}.csv").is_file(), ( + f"manifest lists '{dataset}' but {dataset}.csv is missing" + ) + cl.load_sample(dataset) - # Files to exclude from cl.load_sample(). - files_to_excl: list = [ - '__init__' - ] + # Conversely, every CSV on disk must be declared in the manifest, so a + # newly added data file can't silently go unregistered. + csv_stems = {f.stem for f in data_dir.glob("*.csv")} + assert csv_stems == set(SAMPLES), ( + "manifest and data directory are out of sync: " + f"only in dir={csv_stems - set(SAMPLES)}, " + f"only in manifest={set(SAMPLES) - csv_stems}" + ) - # Gather list of files to test. - datasets = [f.stem for f in data_dir.iterdir() if f.is_file() and f.stem not in files_to_excl] - # Load each file. - for dataset in datasets: - cl.load_sample(dataset) +def test_list_samples() -> None: + """ + Tests cl.list_samples(): the manifest-driven catalog of bundled datasets. + """ + df = cl.list_samples() + # One row per manifest entry, indexed by sample name. + assert df.index.name == "name" + assert set(df.index) == set(SAMPLES) + assert {"index", "columns", "cumulative", "origin_grain", "development_grain"} <= set(df.columns) + + # The fast path skips loading data and therefore omits the grain columns. + fast = cl.list_samples(include_grain=False) + assert set(fast.index) == set(SAMPLES) + assert "origin_grain" not in fast.columns + assert "development_grain" not in fast.columns + + # Metadata matches the manifest source of truth. 
+ assert df.loc["clrd2025", "columns"] == SAMPLES["clrd2025"]["columns"] + assert df.loc["prism", "cumulative"] == SAMPLES["prism"]["cumulative"] def test_load_sample_clrd2025() -> None: diff --git a/chainladder/utils/utility_functions.py b/chainladder/utils/utility_functions.py index a124a37f..cd0b892e 100644 --- a/chainladder/utils/utility_functions.py +++ b/chainladder/utils/utility_functions.py @@ -15,6 +15,7 @@ __dt64_dtype__ ) from chainladder.utils.sparse import sp +from chainladder.utils.data._manifest import SAMPLES from io import StringIO from patsy import dmatrix # noqa from sklearn.base import BaseEstimator, TransformerMixin @@ -43,7 +44,7 @@ def load_sample(key: str, *args, **kwargs) -> Triangle: Datasets that are commonly used in examples are: raa, clrd, and prism. - And a complete list of available datasets is: abc, auto, berqsherm, cc_sample, clrd, clrd2025, genins, ia_sample, liab, m3ir5, mack_1997, mcl, mortgage, mw2008, mw2014, prism, quarterly, raa, tail_sample, ukmotor, usaa, usauto, xyz. + For the complete list of available datasets, call :func:`list_samples`. Returns ------- @@ -86,10 +87,10 @@ def load_sample(key: str, *args, **kwargs) -> Triangle: # Set base path to be the parent directory of this file, e.g., the utils folder. utils_path: AnyStr = os.path.dirname(os.path.abspath(__file__)) - # Validate that the file indicated by the key argument exists. - dataset_path: str = os.path.join(utils_path, "data", key.lower() + ".csv") - - if not os.path.exists(dataset_path): + # Validate the key against the sample-dataset manifest. The manifest is the + # authoritative list of available samples; every entry has a matching CSV in + # the data folder. + if key.lower() not in SAMPLES: raise ValueError( """ Invalid key supplied. The key should match the name, without extension, of one of the file names @@ -102,162 +103,19 @@ def load_sample(key: str, *args, **kwargs) -> Triangle: ) ) - # Set initial values for arguments to Triangle __init__. 
These may be overridden by - # values specific to the data set. - origin: str = "origin" - development: str = "development" - columns: list = ["values"] - index: list | None = None - cumulative: bool = True - - if key.lower() in ["mcl", "usaa", "quarterly", "auto", "usauto", "tail_sample"]: - columns: list = ["incurred", "paid"] - if key.lower() == "clrd": - origin: str = "AccidentYear" - development: str = "DevelopmentYear" - index: list = ["GRNAME", "LOB"] - columns: list = [ - "IncurLoss", - "CumPaidLoss", - "BulkLoss", - "EarnedPremDIR", - "EarnedPremCeded", - "EarnedPremNet", - ] - if key.lower() == "clrd2025": - origin: str = "AccidentYear" - development: str = "DevelopmentYear" - index: list = ["GRNAME", "LOB"] - columns: list = [ - "IncurredLosses", - "CumPaidLoss", - "BulkLoss", - "EarnedPremDIR", - "EarnedPremCeded", - "EarnedPremNet", - ] - if key.lower() == "berqsherm": - origin: str = "AccidentYear" - development: str = "DevelopmentYear" - index: list = ["LOB"] - columns: list = ["Incurred", "Paid", "Reported", "Closed"] - if key.lower() == "xyz": - origin: str = "AccidentYear" - development: str = "DevelopmentYear" - columns: list = ["Incurred", "Paid", "Reported", "Closed", "Premium"] - if key.lower() in ["liab", "auto"]: - index: list = ["lob"] - if key.lower() in ["cc_sample", "ia_sample"]: - columns: list = ["loss", "exposure"] - if key.lower() in ["prism"]: - columns: list = ["reportedCount", "closedPaidCount", "Paid", "Incurred"] - index: list = [ - "ClaimNo", - "Line", - "Type", - "ClaimLiability", - "Limit", - "Deductible", - ] - origin: str = "AccidentDate" - development: str = "PaymentDate" - cumulative: bool = False - if "mack_1997" in key.lower(): - columns = ["Case Incurred"] - origin = "Accident Year" - development = "Calendar Year" - cumulative: bool = True - # Friedland datasets - if "friedland" in key.lower(): - columns: list = ["Paid Claims", "Reported Claims"] - origin: str = "Accident Year" - development: str = "Calendar Year" - 
cumulative: bool = True - index: None = None - if "autoprop" in key.lower(): - columns: list = [ - "Reported ALAE", - "Paid ALAE", - "Reported Claims", - "Paid Claims", - ] - if "auto_salsub" in key.lower(): - columns: list = [ - "Reported Salvage and Subrogation", - "Received Salvage and Subrogation", - "Reported Claims", - "Paid Claims", - ] - if "berq_sher_auto" in key.lower(): - columns: list = [ - "Paid Claims", - "Closed Claim Counts", - "Reported Claim Counts", - "Disposal Rate", - ] - if "gl_insurer" in key.lower(): - columns: list = [ - "Closed Claim Counts", - "Reported Claim Counts", - "Disposal Rate", - "Paid Claims", - ] - if "med_mal" in key.lower(): - columns: list = [ - "Reported Claims", - "Paid Claims", - "Case Outstanding", - "Open Claim Counts", - ] - if "qs" in key.lower(): - columns: list = [ - "Gross Reported Claims", - "Net Reported Claims", - "Net to Gross", - ] - if "auto_case" in key.lower(): - columns: list = ["Case Outstanding", "Paid Claims"] - if "wc_self_insurer" in key.lower(): - columns: list = [ - "Closed Claim Counts", - "Reported Claim Counts", - "Paid Claims", - "Paid Severities", - "Reported Claims", - "Reported Severities", - ] - if "xol" in key.lower(): - columns: list = [ - "Gross Reported Claims", - "Net Reported Claims", - "Ceded Reported Claims", - ] - if "xyz_case" in key.lower(): - columns: list = ["Case Outstanding", "Paid Claims"] - if "xyz_disp" in key.lower(): - columns: list = ["Disposal Rate", "Closed Claim Counts", "Paid Claims"] - if "xyz_freq_sev" in key.lower(): - columns: list = [ - "Closed Claim Counts", - "Reported Claim Counts", - "Reported Claims", - "Reported Severities", - ] - if "auto_freq_sev" in key.lower(): - columns: list = [ - "Closed Claim Counts", - "Reported Claim Counts", - "Reported Claims", - "Reported Severity", - ] - origin: str = "Accident Half-Year" - development: str = "Calendar Half-Year" - if "uspp" in key.lower(): - columns: list = [ - "Reported Claims", - "Paid Claims", - "Earned 
Premium" - ] + dataset_path: str = os.path.join(utils_path, "data", key.lower() + ".csv") + + # Look up the Triangle configuration for this sample from the central + # manifest (chainladder/utils/data/_manifest.py). The manifest is the + # single source of truth for sample-dataset metadata, replacing the long + # per-dataset if/elif chain that previously lived here and duplicated the + # column names already present in the tests and docs/library/sample_data.md. + config: dict = SAMPLES[key.lower()] + origin = config["origin"] + development = config["development"] + index = config["index"] + columns = config["columns"] + cumulative = config["cumulative"] df = pd.read_csv(filepath_or_buffer=dataset_path) @@ -273,6 +131,72 @@ def load_sample(key: str, *args, **kwargs) -> Triangle: ) +# Human-readable labels for the single-character grain codes a Triangle exposes +# via ``origin_grain`` / ``development_grain``. +_GRAIN_LABELS: dict = { + "Y": "Annual", + "S": "Semiannual", + "Q": "Quarter", + "M": "Month", +} + + +def list_samples(include_grain: bool = True) -> DataFrame: + """List the sample datasets bundled with the chainladder package. + + The returned table is driven by the sample-dataset manifest + (``chainladder/utils/data/_manifest.py``), the same source + :func:`load_sample` reads, so it always reflects exactly what is loadable. + + Parameters + ---------- + include_grain: bool + If ``True`` (default), load each sample to report its origin and + development grain (and the number of origin/development periods). This + is the slower path because every Triangle is built. Set to ``False`` to + return just the manifest metadata (name, index, columns, cumulative) + without loading any data. + + Returns + ------- + pandas.DataFrame indexed by sample name, with columns ``index``, + ``columns``, ``cumulative`` and, when ``include_grain`` is ``True``, + ``origin_grain``, ``development_grain``, ``origin_periods`` and + ``development_periods``. 
+ + Examples + -------- + + .. code-block:: python + + import chainladder as cl + cl.list_samples() # full table, grain included + cl.list_samples(include_grain=False) # fast, metadata only + """ + records: list = [] + for name in sorted(SAMPLES): + config: dict = SAMPLES[name] + record: dict = { + "name": name, + "index": config["index"], + "columns": config["columns"], + "cumulative": config["cumulative"], + } + if include_grain: + triangle = load_sample(name) + record["origin_grain"] = _GRAIN_LABELS.get( + triangle.origin_grain, triangle.origin_grain + ) + record["development_grain"] = _GRAIN_LABELS.get( + triangle.development_grain, triangle.development_grain + ) + record["origin_periods"] = len(triangle.origin) + record["development_periods"] = triangle.development.shape[0] + records.append(record) + + return pd.DataFrame.from_records(records).set_index("name") + + def read_pickle(path): with open(path, "rb") as pkl: return dill.load(pkl) diff --git a/docs/library/sample_data.md b/docs/library/sample_data.md index b6a7dc07..1b8882f7 100644 --- a/docs/library/sample_data.md +++ b/docs/library/sample_data.md @@ -4,54 +4,55 @@ Below is the list of all datasets that come included with the `chainladder` pack You can load any dataset with `cl.load_sample(...)` such as `cl.load_sample("abc")`. -\* Denotes datasets that are more interesting and possess unique characteristics. +This table is generated from the sample-dataset manifest +(`chainladder/utils/data/_manifest.py`) via `cl.list_samples()`. To regenerate it, +run `python scripts/regen_sample_data_docs.py` from the repository root. 
- -| Dataset Name | Indexes | Columns | Origin Grain | Development Grain | -|----------------------------------|----------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|--------------------------|--------------------------| -| abc | (none) | (none) | Annual (11 Yrs) | Annual (11 Yrs) | -| auto | [lob] | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | -| berqsherm | [LOB] | [Incurred, Paid, Reported, Closed] | Annual (8 Yrs) | Annual (8 Yrs) | -| cc_sample | [Total] | [loss, exposure] | Annual (5 Yrs) | Annual (5 Yrs) | -| clrd * | [GRNAME, LOB] | [IncurLoss, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded, EarnedPremNet] | Annual (10 Yrs) | Annual (10 Yrs) | -| clrd2025 * | [GRNAME, LOB] | [IncurredLosses, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded, EarnedPremNet] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_auto_bi_insurer* | (none) | [Paid Claims, Reported Claims] | Annual (9 Yrs) | Annual (9 Yrs) | -| friedland_auto_freq_sev* | (none) | [Closed Claim Counts, Reported Claim Counts, Reported Claims, Reported Severity] | Semiannual (10 Half-Yrs) | Semiannual (10 Half-Yrs) | -| friedland_autoprop | (none) | [Reported ALAE, Paid ALAE, Reported Claims, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) | -| friedland_auto_salsub | (none) | [Reported Salvage and Subrogation, Received Salvage and Subrogation, Reported Claims, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) | -| friedland_berq_sher_auto | (none) | [Paid Claims, Closed Claim Counts, Reported Claim Counts, Disposal Rate] | Annual (8 Yrs) | Annual (8 Yrs) | -| friedland_gl_insurer | (none) | [Closed Claim Counts, Reported Claim Counts, Disposal Rate Paid Claims] | Annual (8 Yrs) | Annual (8 Yrs) | -| friedland_med_mal | (none) | [Reported Claims, Paid Claims, Case Outstanding, Open Claim Counts] | Annual (8 Yrs) | Annual (8 Yrs) | -| friedland_qs | (none) | [Gross 
Reported Claims, Net Reported Claims, Net to Gross] | Annual (4 Yrs) | Annual (4 Yrs) | -| friedland_us_auto_chg_prod_mix | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_us_auto_incr_claim | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_us_auto_steady_state | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_us_industry_auto | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_us_industy_auto_case | (none) | [Case Outstanding, Paid Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_uspp_auto_incr_case | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_uspp_auto_incr_claim | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| frieldand_uspp_auto_steady_state | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_uspp_incr_claim_case | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_wc_self_insurer | (none) | [Closed Claim Counts, Reported Claim Counts, Paid Claims, Paid Severities, Reported Claims, Reported Severities] | Annual (8 Yrs) | Annual (10 Yrs) | -| friedland_xol | (none) | [Gross Reported Claims, Net Reported Claims, Ceded Reported Claims] | Annual (4 Yrs) | Annual (4 Yrs) | -| friedland_xyz_auto_bi* | (none) | [Paid Claims, Reported Claims] | Annual (11 Yrs) | Annual (11 Yrs) | -| friedland_xyz_case* | (none) | [Case Outstanding, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) | -| friedland_xyz_disp | (none) | [Disposal Rate, Closed Claim Counts, Paid Claims] | Annual (8 Yrs) | Annual (8 Yrs) | -| friedland_xyz_freq_sev* | (none) | [Closed Claim Counts, Reported Claim Counts, Reported Claims, Reported Severities] | Annual (11 Yrs) | Annual (11 Yrs) | -| genins | (none) | (none) | Annual (10 Yrs) | Annual (10 Yrs) | -| ia_sample 
| [Total] | [loss, exposure] | Annual (6 Yrs) | Annual (6 Yrs) | -| liab * | [lob] | [values] | Annual (14 Yrs) | Annual (14 Yrs) | -| mack_1997 | (none) | [Case Incurred] | Annual (10 Yrs) | Annual (10 Yrs) | -| m3ir5 | (none) | (none) | Annual (14 Yrs) | Annual (14 Yrs) | -| mcl | [Total] | [incurred, paid] | Annual (7 Yrs) | Annual (7 Yrs) | -| mortgage | (none) | (none) | Annual (9 Yrs) | Annual (9 Yrs) | -| mw2008 | (none) | (none) | Annual (9 Yrs) | Annual (9 Yrs) | -| mw2014 | (none) | (none) | Annual (17 Yrs) | Annual (17 Yrs) | -| prism * | [ClaimNo, Line, Type, ClaimLiability, Limit, Deductible] | [reportedCount, closedPaidCount, Paid, Incurred] | Month (120 months) | Month (120 months) | -| quarterly * | [Total] | [incurred, paid] | Annual (12 Yrs) | Quarter (45 Qtrs) | -| raa | (none) | (none) | Annual (10 Yrs) | Annual (10 Yrs) | -| tail_sample | [Total] | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | -| ukmotor | (none) | (none) | Annual (7 Yrs) | Annual (7 Yrs) | -| usaa | [Total] | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | -| usauto | [Total] | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | -| xyz* | [Total] | [Incurred, Paid, Reported, Closed, Premium] | Annual (11 Yrs) | Annual (11 Yrs) | +| Dataset Name | Indexes | Columns | Origin Grain | Development Grain | +|---|---|---|---|---| +| abc | (none) | [values] | Annual (11 Yrs) | Annual (11 Yrs) | +| auto | [lob] | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | +| berqsherm | [LOB] | [Incurred, Paid, Reported, Closed] | Annual (8 Yrs) | Annual (8 Yrs) | +| cc_sample | (none) | [loss, exposure] | Annual (5 Yrs) | Annual (5 Yrs) | +| clrd | [GRNAME, LOB] | [IncurLoss, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded, EarnedPremNet] | Annual (10 Yrs) | Annual (10 Yrs) | +| clrd2025 | [GRNAME, LOB] | [IncurredLosses, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded, EarnedPremNet] | Annual (19 Yrs) | Annual (19 Yrs) | +| friedland_auto_bi_insurer | (none) 
| [Paid Claims, Reported Claims] | Annual (9 Yrs) | Annual (9 Yrs) | +| friedland_auto_freq_sev | (none) | [Closed Claim Counts, Reported Claim Counts, Reported Claims, Reported Severity] | Semiannual (10 Half-Yrs) | Semiannual (10 Half-Yrs) | +| friedland_auto_salsub | (none) | [Reported Salvage and Subrogation, Received Salvage and Subrogation, Reported Claims, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) | +| friedland_autoprop | (none) | [Reported ALAE, Paid ALAE, Reported Claims, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) | +| friedland_berq_sher_auto | (none) | [Paid Claims, Closed Claim Counts, Reported Claim Counts, Disposal Rate] | Annual (8 Yrs) | Annual (8 Yrs) | +| friedland_gl_insurer | (none) | [Closed Claim Counts, Reported Claim Counts, Disposal Rate, Paid Claims] | Annual (8 Yrs) | Annual (8 Yrs) | +| friedland_med_mal | (none) | [Reported Claims, Paid Claims, Case Outstanding, Open Claim Counts] | Annual (8 Yrs) | Annual (8 Yrs) | +| friedland_qs | (none) | [Gross Reported Claims, Net Reported Claims, Net to Gross] | Annual (4 Yrs) | Annual (4 Yrs) | +| friedland_us_auto_chg_prod_mix | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | +| friedland_us_auto_incr_claim | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | +| friedland_us_auto_steady_state | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | +| friedland_us_industry_auto | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | +| friedland_us_industry_auto_case | (none) | [Case Outstanding, Paid Claims] | Annual (10 Yrs) | Annual (10 Yrs) | +| friedland_uspp_auto_increasing_case | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) | +| friedland_uspp_auto_increasing_claim | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) | +| friedland_uspp_auto_steady_state | (none) | [Reported Claims, 
Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) | +| friedland_uspp_increasing_claim_case | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) | +| friedland_wc_self_insurer | (none) | [Closed Claim Counts, Reported Claim Counts, Paid Claims, Paid Severities, Reported Claims, Reported Severities] | Annual (8 Yrs) | Annual (8 Yrs) | +| friedland_xol | (none) | [Gross Reported Claims, Net Reported Claims, Ceded Reported Claims] | Annual (4 Yrs) | Annual (4 Yrs) | +| friedland_xyz_auto_bi | (none) | [Paid Claims, Reported Claims] | Annual (11 Yrs) | Annual (11 Yrs) | +| friedland_xyz_case | (none) | [Case Outstanding, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) | +| friedland_xyz_disp | (none) | [Disposal Rate, Closed Claim Counts, Paid Claims] | Annual (8 Yrs) | Annual (8 Yrs) | +| friedland_xyz_freq_sev | (none) | [Closed Claim Counts, Reported Claim Counts, Reported Claims, Reported Severities] | Annual (11 Yrs) | Annual (11 Yrs) | +| genins | (none) | [values] | Annual (10 Yrs) | Annual (10 Yrs) | +| ia_sample | (none) | [loss, exposure] | Annual (6 Yrs) | Annual (6 Yrs) | +| liab | [lob] | [values] | Annual (14 Yrs) | Annual (14 Yrs) | +| m3ir5 | (none) | [values] | Annual (14 Yrs) | Annual (14 Yrs) | +| mack_1997 | (none) | [Case Incurred] | Annual (10 Yrs) | Annual (10 Yrs) | +| mcl | (none) | [incurred, paid] | Annual (7 Yrs) | Annual (7 Yrs) | +| mortgage | (none) | [values] | Annual (9 Yrs) | Annual (9 Yrs) | +| mw2008 | (none) | [values] | Annual (9 Yrs) | Annual (9 Yrs) | +| mw2014 | (none) | [values] | Annual (17 Yrs) | Annual (17 Yrs) | +| prism | [ClaimNo, Line, Type, ClaimLiability, Limit, Deductible] | [reportedCount, closedPaidCount, Paid, Incurred] | Month (120 months) | Month (120 months) | +| quarterly | (none) | [incurred, paid] | Annual (12 Yrs) | Quarter (45 Qtrs) | +| raa | (none) | [values] | Annual (10 Yrs) | Annual (10 Yrs) | +| tail_sample | (none) | [incurred, paid] | Annual 
(10 Yrs) | Annual (10 Yrs) | +| ukmotor | (none) | [values] | Annual (7 Yrs) | Annual (7 Yrs) | +| usaa | (none) | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | +| usauto | (none) | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | +| xyz | (none) | [Incurred, Paid, Reported, Closed, Premium] | Annual (11 Yrs) | Annual (11 Yrs) | diff --git a/scripts/regen_sample_data_docs.py b/scripts/regen_sample_data_docs.py new file mode 100644 index 00000000..5c250c0a --- /dev/null +++ b/scripts/regen_sample_data_docs.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +"""Regenerate docs/library/sample_data.md from the sample-dataset manifest. + +The sample-data documentation table used to be maintained by hand, which let it +drift out of sync with the actual datasets (missing rows, typo'd names, wrong +grain). It is now generated from the single source of truth, +``chainladder/utils/data/_manifest.py``, by way of :func:`chainladder.list_samples`. + +Run from the repository root after adding or changing a sample dataset:: + + python scripts/regen_sample_data_docs.py + +The script overwrites ``docs/library/sample_data.md`` in place. +""" +from __future__ import annotations + +from pathlib import Path + +import chainladder as cl + +REPO_ROOT = Path(__file__).resolve().parent.parent +DOCS_PATH = REPO_ROOT / "docs" / "library" / "sample_data.md" + +HEADER = """# Sample Dataset + +Below is the list of all datasets that come included with the `chainladder` package, and their basic attributes. + +You can load any dataset with `cl.load_sample(...)` such as `cl.load_sample("abc")`. + +This table is generated from the sample-dataset manifest +(`chainladder/utils/data/_manifest.py`) via `cl.list_samples()`. To regenerate it, +run `python scripts/regen_sample_data_docs.py` from the repository root. 
+ +""" + + +def _fmt_list(value) -> str: + """Render an index/columns cell.""" + if value is None: + return "(none)" + return "[" + ", ".join(str(v) for v in value) + "]" + + +# Unit noun used in the "(N units)" suffix on each grain label. +_GRAIN_UNITS: dict = { + "Annual": "Yrs", + "Semiannual": "Half-Yrs", + "Quarter": "Qtrs", + "Month": "months", +} + + +def _fmt_grain(label: str, periods: int) -> str: + """Render a grain cell, e.g. 'Annual (10 Yrs)'.""" + unit = _GRAIN_UNITS.get(label, label) + return f"{label} ({periods} {unit})" + + +def build_table() -> str: + df = cl.list_samples() + + rows = [ + "| Dataset Name | Indexes | Columns | Origin Grain | Development Grain |", + "|---|---|---|---|---|", + ] + for name, row in df.iterrows(): + rows.append( + "| {name} | {index} | {columns} | {origin} | {development} |".format( + name=name, + index=_fmt_list(row["index"]), + columns=_fmt_list(row["columns"]), + origin=_fmt_grain(row["origin_grain"], row["origin_periods"]), + development=_fmt_grain( + row["development_grain"], row["development_periods"] + ), + ) + ) + return "\n".join(rows) + "\n" + + +def main() -> None: + content = HEADER + build_table() + DOCS_PATH.write_text(content) + print(f"Wrote {DOCS_PATH.relative_to(REPO_ROOT)}") + + +if __name__ == "__main__": + main() From 577fc842b3d54325f2b5f8e50746ee391775db63 Mon Sep 17 00:00:00 2001 From: Nick Kinney Date: Sat, 30 May 2026 23:00:50 -0400 Subject: [PATCH 2/3] docs+test: wire list_samples into API ref, prove sdist completeness (#774) Addresses review feedback on #886 from @henrydingliu: - Add list_samples to docs/library/api.md autosummary and a matching generated stub so the new utility appears in the API reference. The reference is hand-maintained, not auto-discovered, so this is a manual add. Also dropped a pre-existing duplicate load_sample entry in the same autosummary block. 
- Add test_sdist_ships_all_samples: builds a source distribution and asserts every sample CSV is present, guarding against MANIFEST.in drifting out of sync again. Self-skips when the build package or a source checkout is unavailable, so it stays out of the fast suite as Henry suggested. --- chainladder/utils/tests/test_utilities.py | 47 +++++++++++++++++++ docs/library/api.md | 2 +- .../generated/chainladder.list_samples.rst | 6 +++ 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 docs/library/generated/chainladder.list_samples.rst diff --git a/chainladder/utils/tests/test_utilities.py b/chainladder/utils/tests/test_utilities.py index bc68abfc..f3608cc1 100644 --- a/chainladder/utils/tests/test_utilities.py +++ b/chainladder/utils/tests/test_utilities.py @@ -178,6 +178,53 @@ def test_list_samples() -> None: assert df.loc["prism", "cumulative"] == SAMPLES["prism"]["cumulative"] +def test_sdist_ships_all_samples(tmp_path) -> None: + """ + Build a source distribution and assert it contains every sample CSV. + + This is the guard against MANIFEST.in drifting out of sync with the data + folder again (the bug behind #774: the old per-file include list shipped + only 22 of the bundled CSVs). It is deliberately self-skipping rather than + a hard requirement of the fast suite: it needs the ``build`` package and a + source checkout (a pyproject.toml at the repo root), and it shells out to a + full sdist build, so it no-ops in environments that lack either. + """ + import subprocess + import sys + import tarfile + + pytest.importorskip("build", reason="requires the build package") + + # Locate the repo root (the directory containing pyproject.toml). When + # running from an installed wheel there is no source tree, so skip. 
+ repo_root: Path = Path(__file__).resolve().parents[3] + if not (repo_root / "pyproject.toml").is_file(): + pytest.skip("not running from a source checkout") + + data_dir: Path = Path(__file__).parent.parent / "data" + expected_csvs = {f.name for f in data_dir.glob("*.csv")} + + subprocess.run( + [sys.executable, "-m", "build", "--sdist", "--outdir", str(tmp_path)], + cwd=repo_root, + check=True, + capture_output=True, + ) + + sdists = list(tmp_path.glob("*.tar.gz")) + assert len(sdists) == 1, f"expected one sdist, found {sdists}" + + with tarfile.open(sdists[0]) as tar: + shipped = { + Path(name).name + for name in tar.getnames() + if "/utils/data/" in name and name.endswith(".csv") + } + + missing = expected_csvs - shipped + assert not missing, f"sdist is missing sample CSVs: {sorted(missing)}" + + def test_load_sample_clrd2025() -> None: """ Tests the clrd2025 sample (CAS Schedule P 1998-2007 refresh). diff --git a/docs/library/api.md b/docs/library/api.md index d0985fab..7565e421 100644 --- a/docs/library/api.md +++ b/docs/library/api.md @@ -165,10 +165,10 @@ Functions :template: function.rst load_sample + list_samples read_pickle read_json concat - load_sample minimum maximum diff --git a/docs/library/generated/chainladder.list_samples.rst b/docs/library/generated/chainladder.list_samples.rst new file mode 100644 index 00000000..2f430531 --- /dev/null +++ b/docs/library/generated/chainladder.list_samples.rst @@ -0,0 +1,6 @@ +chainladder.list\_samples +========================= + +.. currentmodule:: chainladder + +.. 
autofunction:: list_samples \ No newline at end of file From 724b45518cfebdbbaf51e2e6ee31fd08b3ac7e03 Mon Sep 17 00:00:00 2001 From: Nick Kinney Date: Sun, 31 May 2026 11:12:44 -0400 Subject: [PATCH 3/3] docs: convert sample_data to a notebook that calls list_samples (#774) Per Henry's review on #886: replace the static sample_data.md table with a notebook (sample_data.ipynb) that renders the table live via cl.list_samples(), so the docs can never drift from the manifest. This also retires the now-pointless scripts/regen_sample_data_docs.py and the committed list_samples.rst autosummary stub (autosummary regenerates it at build, and the experimental branch is removing these stubs anyway; the api.md autosummary entry for list_samples stays). The notebook keeps the '# Sample Dataset' H1 so the existing sample_data.html#sample-dataset anchor used by the tutorials still resolves. _toc.yml, the manifest module docstring, and the utility_functions comment are updated to point at the notebook. --- chainladder/utils/data/_manifest.py | 3 +- chainladder/utils/utility_functions.py | 2 +- docs/_toc.yml | 2 +- .../generated/chainladder.list_samples.rst | 6 -- docs/library/sample_data.ipynb | 52 +++++++++++ docs/library/sample_data.md | 58 ------------ scripts/regen_sample_data_docs.py | 88 ------------------- 7 files changed, 56 insertions(+), 155 deletions(-) delete mode 100644 docs/library/generated/chainladder.list_samples.rst create mode 100644 docs/library/sample_data.ipynb delete mode 100644 docs/library/sample_data.md delete mode 100644 scripts/regen_sample_data_docs.py diff --git a/chainladder/utils/data/_manifest.py b/chainladder/utils/data/_manifest.py index ff91a191..c0f65139 100644 --- a/chainladder/utils/data/_manifest.py +++ b/chainladder/utils/data/_manifest.py @@ -5,7 +5,8 @@ * :func:`chainladder.load_sample` -- to build the ``Triangle`` for a sample. * :func:`chainladder.list_samples` -- to list available samples. 
-* ``docs/library/sample_data.md`` -- regenerated from this registry. +* ``docs/library/sample_data.ipynb`` -- renders the sample table live via + ``cl.list_samples()``. * ``MANIFEST.in`` -- ships ``chainladder/utils/data/*.csv`` via a wildcard. Adding a new sample dataset is a one-entry change here (plus dropping the diff --git a/chainladder/utils/utility_functions.py b/chainladder/utils/utility_functions.py index cd0b892e..7e8ac606 100644 --- a/chainladder/utils/utility_functions.py +++ b/chainladder/utils/utility_functions.py @@ -109,7 +109,7 @@ def load_sample(key: str, *args, **kwargs) -> Triangle: # manifest (chainladder/utils/data/_manifest.py). The manifest is the # single source of truth for sample-dataset metadata, replacing the long # per-dataset if/elif chain that previously lived here and duplicated the - # column names already present in the tests and docs/library/sample_data.md. + # column names already present in the tests and the sample-data docs. config: dict = SAMPLES[key.lower()] origin = config["origin"] development = config["development"] diff --git a/docs/_toc.yml b/docs/_toc.yml index 6229fceb..b7c6a219 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -44,7 +44,7 @@ parts: sections: - file: library/usage.md - file: library/references.md - - file: library/sample_data.md + - file: library/sample_data.ipynb - file: library/glossary.md - file: library/questions_issues.md - file: library/contributing.md diff --git a/docs/library/generated/chainladder.list_samples.rst b/docs/library/generated/chainladder.list_samples.rst deleted file mode 100644 index 2f430531..00000000 --- a/docs/library/generated/chainladder.list_samples.rst +++ /dev/null @@ -1,6 +0,0 @@ -chainladder.list\_samples -========================= - -.. currentmodule:: chainladder - -.. 
autofunction:: list_samples \ No newline at end of file diff --git a/docs/library/sample_data.ipynb b/docs/library/sample_data.ipynb new file mode 100644 index 00000000..d983c656 --- /dev/null +++ b/docs/library/sample_data.ipynb @@ -0,0 +1,52 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Sample Dataset\n", + "\n", + "The `chainladder` package ships with a number of sample datasets that are\n", + "used throughout the documentation and examples. Load any of them with\n", + "`cl.load_sample(...)`, for example `cl.load_sample(\"abc\")`.\n", + "\n", + "The table below lists every available dataset alongside its basic\n", + "attributes. It is produced live by `cl.list_samples()`, which reads from the\n", + "sample-dataset manifest (`chainladder/utils/data/_manifest.py`), so it never\n", + "drifts out of sync with the datasets that actually ship." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import chainladder as cl\n", + "\n", + "cl.list_samples()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/library/sample_data.md b/docs/library/sample_data.md deleted file mode 100644 index 1b8882f7..00000000 --- a/docs/library/sample_data.md +++ /dev/null @@ -1,58 +0,0 @@ -# Sample Dataset - -Below is the list of all datasets that come included with the `chainladder` package, and their basic attributes. - -You can load any dataset with `cl.load_sample(...)` such as `cl.load_sample("abc")`. 
- -This table is generated from the sample-dataset manifest -(`chainladder/utils/data/_manifest.py`) via `cl.list_samples()`. To regenerate it, -run `python scripts/regen_sample_data_docs.py` from the repository root. - -| Dataset Name | Indexes | Columns | Origin Grain | Development Grain | -|---|---|---|---|---| -| abc | (none) | [values] | Annual (11 Yrs) | Annual (11 Yrs) | -| auto | [lob] | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | -| berqsherm | [LOB] | [Incurred, Paid, Reported, Closed] | Annual (8 Yrs) | Annual (8 Yrs) | -| cc_sample | (none) | [loss, exposure] | Annual (5 Yrs) | Annual (5 Yrs) | -| clrd | [GRNAME, LOB] | [IncurLoss, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded, EarnedPremNet] | Annual (10 Yrs) | Annual (10 Yrs) | -| clrd2025 | [GRNAME, LOB] | [IncurredLosses, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded, EarnedPremNet] | Annual (19 Yrs) | Annual (19 Yrs) | -| friedland_auto_bi_insurer | (none) | [Paid Claims, Reported Claims] | Annual (9 Yrs) | Annual (9 Yrs) | -| friedland_auto_freq_sev | (none) | [Closed Claim Counts, Reported Claim Counts, Reported Claims, Reported Severity] | Semiannual (10 Half-Yrs) | Semiannual (10 Half-Yrs) | -| friedland_auto_salsub | (none) | [Reported Salvage and Subrogation, Received Salvage and Subrogation, Reported Claims, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) | -| friedland_autoprop | (none) | [Reported ALAE, Paid ALAE, Reported Claims, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) | -| friedland_berq_sher_auto | (none) | [Paid Claims, Closed Claim Counts, Reported Claim Counts, Disposal Rate] | Annual (8 Yrs) | Annual (8 Yrs) | -| friedland_gl_insurer | (none) | [Closed Claim Counts, Reported Claim Counts, Disposal Rate, Paid Claims] | Annual (8 Yrs) | Annual (8 Yrs) | -| friedland_med_mal | (none) | [Reported Claims, Paid Claims, Case Outstanding, Open Claim Counts] | Annual (8 Yrs) | Annual (8 Yrs) | -| friedland_qs | (none) | [Gross Reported Claims, Net 
Reported Claims, Net to Gross] | Annual (4 Yrs) | Annual (4 Yrs) | -| friedland_us_auto_chg_prod_mix | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_us_auto_incr_claim | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_us_auto_steady_state | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_us_industry_auto | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_us_industry_auto_case | (none) | [Case Outstanding, Paid Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_uspp_auto_increasing_case | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_uspp_auto_increasing_claim | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_uspp_auto_steady_state | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_uspp_increasing_claim_case | (none) | [Reported Claims, Paid Claims, Earned Premium] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_wc_self_insurer | (none) | [Closed Claim Counts, Reported Claim Counts, Paid Claims, Paid Severities, Reported Claims, Reported Severities] | Annual (8 Yrs) | Annual (8 Yrs) | -| friedland_xol | (none) | [Gross Reported Claims, Net Reported Claims, Ceded Reported Claims] | Annual (4 Yrs) | Annual (4 Yrs) | -| friedland_xyz_auto_bi | (none) | [Paid Claims, Reported Claims] | Annual (11 Yrs) | Annual (11 Yrs) | -| friedland_xyz_case | (none) | [Case Outstanding, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) | -| friedland_xyz_disp | (none) | [Disposal Rate, Closed Claim Counts, Paid Claims] | Annual (8 Yrs) | Annual (8 Yrs) | -| friedland_xyz_freq_sev | (none) | [Closed Claim Counts, Reported Claim Counts, Reported Claims, Reported Severities] | Annual (11 Yrs) | Annual (11 Yrs) | -| genins | (none) | 
[values] | Annual (10 Yrs) | Annual (10 Yrs) | -| ia_sample | (none) | [loss, exposure] | Annual (6 Yrs) | Annual (6 Yrs) | -| liab | [lob] | [values] | Annual (14 Yrs) | Annual (14 Yrs) | -| m3ir5 | (none) | [values] | Annual (14 Yrs) | Annual (14 Yrs) | -| mack_1997 | (none) | [Case Incurred] | Annual (10 Yrs) | Annual (10 Yrs) | -| mcl | (none) | [incurred, paid] | Annual (7 Yrs) | Annual (7 Yrs) | -| mortgage | (none) | [values] | Annual (9 Yrs) | Annual (9 Yrs) | -| mw2008 | (none) | [values] | Annual (9 Yrs) | Annual (9 Yrs) | -| mw2014 | (none) | [values] | Annual (17 Yrs) | Annual (17 Yrs) | -| prism | [ClaimNo, Line, Type, ClaimLiability, Limit, Deductible] | [reportedCount, closedPaidCount, Paid, Incurred] | Month (120 months) | Month (120 months) | -| quarterly | (none) | [incurred, paid] | Annual (12 Yrs) | Quarter (45 Qtrs) | -| raa | (none) | [values] | Annual (10 Yrs) | Annual (10 Yrs) | -| tail_sample | (none) | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | -| ukmotor | (none) | [values] | Annual (7 Yrs) | Annual (7 Yrs) | -| usaa | (none) | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | -| usauto | (none) | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | -| xyz | (none) | [Incurred, Paid, Reported, Closed, Premium] | Annual (11 Yrs) | Annual (11 Yrs) | diff --git a/scripts/regen_sample_data_docs.py b/scripts/regen_sample_data_docs.py deleted file mode 100644 index 5c250c0a..00000000 --- a/scripts/regen_sample_data_docs.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python -"""Regenerate docs/library/sample_data.md from the sample-dataset manifest. - -The sample-data documentation table used to be maintained by hand, which let it -drift out of sync with the actual datasets (missing rows, typo'd names, wrong -grain). It is now generated from the single source of truth, -``chainladder/utils/data/_manifest.py``, by way of :func:`chainladder.list_samples`. 
- -Run from the repository root after adding or changing a sample dataset:: - - python scripts/regen_sample_data_docs.py - -The script overwrites ``docs/library/sample_data.md`` in place. -""" -from __future__ import annotations - -from pathlib import Path - -import chainladder as cl - -REPO_ROOT = Path(__file__).resolve().parent.parent -DOCS_PATH = REPO_ROOT / "docs" / "library" / "sample_data.md" - -HEADER = """# Sample Dataset - -Below is the list of all datasets that come included with the `chainladder` package, and their basic attributes. - -You can load any dataset with `cl.load_sample(...)` such as `cl.load_sample("abc")`. - -This table is generated from the sample-dataset manifest -(`chainladder/utils/data/_manifest.py`) via `cl.list_samples()`. To regenerate it, -run `python scripts/regen_sample_data_docs.py` from the repository root. - -""" - - -def _fmt_list(value) -> str: - """Render an index/columns cell.""" - if value is None: - return "(none)" - return "[" + ", ".join(str(v) for v in value) + "]" - - -# Unit noun used in the "(N units)" suffix on each grain label. -_GRAIN_UNITS: dict = { - "Annual": "Yrs", - "Semiannual": "Half-Yrs", - "Quarter": "Qtrs", - "Month": "months", -} - - -def _fmt_grain(label: str, periods: int) -> str: - """Render a grain cell, e.g. 
'Annual (10 Yrs)'.""" - unit = _GRAIN_UNITS.get(label, label) - return f"{label} ({periods} {unit})" - - -def build_table() -> str: - df = cl.list_samples() - - rows = [ - "| Dataset Name | Indexes | Columns | Origin Grain | Development Grain |", - "|---|---|---|---|---|", - ] - for name, row in df.iterrows(): - rows.append( - "| {name} | {index} | {columns} | {origin} | {development} |".format( - name=name, - index=_fmt_list(row["index"]), - columns=_fmt_list(row["columns"]), - origin=_fmt_grain(row["origin_grain"], row["origin_periods"]), - development=_fmt_grain( - row["development_grain"], row["development_periods"] - ), - ) - ) - return "\n".join(rows) + "\n" - - -def main() -> None: - content = HEADER + build_table() - DOCS_PATH.write_text(content) - print(f"Wrote {DOCS_PATH.relative_to(REPO_ROOT)}") - - -if __name__ == "__main__": - main()