"""Central registry of bundled sample datasets.

This module is the single source of truth for the metadata of every CSV in
``chainladder/utils/data/``.  It is consumed by:

* :func:`chainladder.load_sample` -- to build the ``Triangle`` for a sample.
* :func:`chainladder.list_samples` -- to list available samples.
* ``docs/library/sample_data.ipynb`` -- renders the sample table live via
  ``cl.list_samples()``.
* ``MANIFEST.in`` -- ships ``chainladder/utils/data/*.csv`` via a wildcard.

Registering a new sample dataset means adding one entry to ``SAMPLES`` below
and dropping the CSV in ``chainladder/utils/data/``; ``load_sample``, the docs
table, and the tests all key off this dict.

Every value in ``SAMPLES`` is a dict of keyword arguments passed to
``Triangle``; the key is the sample name (the CSV filename without extension,
lower case):

``origin``
    Column name(s) for the origin period.
``development``
    Column name(s) for the development period.
``index``
    Column name(s) used as the Triangle index, or ``None``.
``columns``
    Measure column name(s) loaded into the Triangle.
``cumulative``
    ``True`` if the measures are cumulative, ``False`` if incremental.
"""


def _entry(
    columns,
    origin="origin",
    development="development",
    index=None,
    cumulative=True,
):
    """Build one manifest record, with its keys in the canonical order."""
    return {
        "origin": origin,
        "development": development,
        "index": index,
        "columns": columns,
        "cumulative": cumulative,
    }


def _ay_dev(columns, index=None):
    """Record for a CSV keyed by ``AccidentYear`` / ``DevelopmentYear``."""
    return _entry(
        columns,
        origin="AccidentYear",
        development="DevelopmentYear",
        index=index,
    )


def _ay_cal(*columns, origin="Accident Year", development="Calendar Year"):
    """Record for an un-indexed CSV keyed by ``Accident Year`` / ``Calendar Year``."""
    return _entry(list(columns), origin=origin, development=development)


SAMPLES: dict = {
    "abc": _entry(["values"]),
    "auto": _entry(["incurred", "paid"], index=["lob"]),
    "berqsherm": _ay_dev(
        ["Incurred", "Paid", "Reported", "Closed"],
        index=["LOB"],
    ),
    "cc_sample": _entry(["loss", "exposure"]),
    "clrd": _ay_dev(
        [
            "IncurLoss",
            "CumPaidLoss",
            "BulkLoss",
            "EarnedPremDIR",
            "EarnedPremCeded",
            "EarnedPremNet",
        ],
        index=["GRNAME", "LOB"],
    ),
    "clrd2025": _ay_dev(
        [
            "IncurredLosses",
            "CumPaidLoss",
            "BulkLoss",
            "EarnedPremDIR",
            "EarnedPremCeded",
            "EarnedPremNet",
        ],
        index=["GRNAME", "LOB"],
    ),
    "friedland_auto_bi_insurer": _ay_cal("Paid Claims", "Reported Claims"),
    "friedland_auto_freq_sev": _ay_cal(
        "Closed Claim Counts",
        "Reported Claim Counts",
        "Reported Claims",
        "Reported Severity",
        origin="Accident Half-Year",
        development="Calendar Half-Year",
    ),
    "friedland_auto_salsub": _ay_cal(
        "Reported Salvage and Subrogation",
        "Received Salvage and Subrogation",
        "Reported Claims",
        "Paid Claims",
    ),
    "friedland_autoprop": _ay_cal(
        "Reported ALAE",
        "Paid ALAE",
        "Reported Claims",
        "Paid Claims",
    ),
    "friedland_berq_sher_auto": _ay_cal(
        "Paid Claims",
        "Closed Claim Counts",
        "Reported Claim Counts",
        "Disposal Rate",
    ),
    "friedland_gl_insurer": _ay_cal(
        "Closed Claim Counts",
        "Reported Claim Counts",
        "Disposal Rate",
        "Paid Claims",
    ),
    "friedland_med_mal": _ay_cal(
        "Reported Claims",
        "Paid Claims",
        "Case Outstanding",
        "Open Claim Counts",
    ),
    "friedland_qs": _ay_cal(
        "Gross Reported Claims",
        "Net Reported Claims",
        "Net to Gross",
    ),
    "friedland_us_auto_chg_prod_mix": _ay_cal("Paid Claims", "Reported Claims"),
    "friedland_us_auto_incr_claim": _ay_cal("Paid Claims", "Reported Claims"),
    "friedland_us_auto_steady_state": _ay_cal("Paid Claims", "Reported Claims"),
    "friedland_us_industry_auto": _ay_cal("Paid Claims", "Reported Claims"),
    "friedland_us_industry_auto_case": _ay_cal("Case Outstanding", "Paid Claims"),
    "friedland_uspp_auto_increasing_case": _ay_cal(
        "Reported Claims", "Paid Claims", "Earned Premium"
    ),
    "friedland_uspp_auto_increasing_claim": _ay_cal(
        "Reported Claims", "Paid Claims", "Earned Premium"
    ),
    "friedland_uspp_auto_steady_state": _ay_cal(
        "Reported Claims", "Paid Claims", "Earned Premium"
    ),
    "friedland_uspp_increasing_claim_case": _ay_cal(
        "Reported Claims", "Paid Claims", "Earned Premium"
    ),
    "friedland_wc_self_insurer": _ay_cal(
        "Closed Claim Counts",
        "Reported Claim Counts",
        "Paid Claims",
        "Paid Severities",
        "Reported Claims",
        "Reported Severities",
    ),
    "friedland_xol": _ay_cal(
        "Gross Reported Claims",
        "Net Reported Claims",
        "Ceded Reported Claims",
    ),
    "friedland_xyz_auto_bi": _ay_cal("Paid Claims", "Reported Claims"),
    "friedland_xyz_case": _ay_cal("Case Outstanding", "Paid Claims"),
    "friedland_xyz_disp": _ay_cal(
        "Disposal Rate", "Closed Claim Counts", "Paid Claims"
    ),
    "friedland_xyz_freq_sev": _ay_cal(
        "Closed Claim Counts",
        "Reported Claim Counts",
        "Reported Claims",
        "Reported Severities",
    ),
    "genins": _entry(["values"]),
    "ia_sample": _entry(["loss", "exposure"]),
    "liab": _entry(["values"], index=["lob"]),
    "m3ir5": _entry(["values"]),
    "mack_1997": _ay_cal("Case Incurred"),
    "mcl": _entry(["incurred", "paid"]),
    "mortgage": _entry(["values"]),
    "mw2008": _entry(["values"]),
    "mw2014": _entry(["values"]),
    # The only incremental sample in the registry.
    "prism": _entry(
        ["reportedCount", "closedPaidCount", "Paid", "Incurred"],
        origin="AccidentDate",
        development="PaymentDate",
        index=["ClaimNo", "Line", "Type", "ClaimLiability", "Limit", "Deductible"],
        cumulative=False,
    ),
    "quarterly": _entry(["incurred", "paid"]),
    "raa": _entry(["values"]),
    "tail_sample": _entry(["incurred", "paid"]),
    "ukmotor": _entry(["values"]),
    "usaa": _entry(["incurred", "paid"]),
    "usauto": _entry(["incurred", "paid"]),
    "xyz": _ay_dev(["Incurred", "Paid", "Reported", "Closed", "Premium"]),
}
def test_load_sample() -> None:
    """
    Tests whether every sample data set declared in the manifest loads.

    Iterating over the manifest (rather than globbing the data directory)
    means adding a new sample is a one-entry change in
    ``chainladder/utils/data/_manifest.py`` and this test picks it up
    automatically, while non-data files in the folder (``__init__.py``,
    ``_manifest.py``) are never mistaken for datasets.
    """
    # Every manifest entry must load and have a matching CSV on disk.
    data_dir: Path = Path(__file__).parent.parent / "data"
    for dataset in SAMPLES:
        assert (data_dir / f"{dataset}.csv").is_file(), (
            f"manifest lists '{dataset}' but {dataset}.csv is missing"
        )
        cl.load_sample(dataset)

    # Conversely, every CSV on disk must be declared in the manifest, so a
    # newly added data file can't silently go unregistered.
    csv_stems = {f.stem for f in data_dir.glob("*.csv")}
    assert csv_stems == set(SAMPLES), (
        "manifest and data directory are out of sync: "
        f"only in dir={csv_stems - set(SAMPLES)}, "
        f"only in manifest={set(SAMPLES) - csv_stems}"
    )


def test_list_samples() -> None:
    """
    Tests cl.list_samples(): the manifest-driven catalog of bundled datasets.
    """
    df = cl.list_samples()
    # One row per manifest entry, indexed by sample name.
    assert df.index.name == "name"
    assert set(df.index) == set(SAMPLES)
    assert {"index", "columns", "cumulative", "origin_grain", "development_grain"} <= set(df.columns)

    # The fast path skips loading data and therefore omits the grain columns.
    fast = cl.list_samples(include_grain=False)
    assert set(fast.index) == set(SAMPLES)
    assert "origin_grain" not in fast.columns
    assert "development_grain" not in fast.columns

    # Metadata matches the manifest source of truth.
    assert df.loc["clrd2025", "columns"] == SAMPLES["clrd2025"]["columns"]
    assert df.loc["prism", "cumulative"] == SAMPLES["prism"]["cumulative"]


def test_sdist_ships_all_samples(tmp_path) -> None:
    """
    Build a source distribution and assert it contains every sample CSV.

    This is the guard against MANIFEST.in drifting out of sync with the data
    folder again (the bug behind #774: the old per-file include list shipped
    only 22 of the bundled CSVs). It is deliberately self-skipping rather than
    a hard requirement of the fast suite: it needs the ``build`` package and a
    source checkout (a pyproject.toml at the repo root), and it shells out to a
    full sdist build, so it no-ops in environments that lack either.
    """
    import subprocess
    import sys
    import tarfile

    pytest.importorskip("build", reason="requires the build package")

    # Locate the repo root (the directory containing pyproject.toml). When
    # running from an installed wheel there is no source tree, so skip.
    repo_root: Path = Path(__file__).resolve().parents[3]
    if not (repo_root / "pyproject.toml").is_file():
        pytest.skip("not running from a source checkout")

    data_dir: Path = Path(__file__).parent.parent / "data"
    expected_csvs = {f.name for f in data_dir.glob("*.csv")}

    # Capture the build output but check the exit status ourselves: with
    # ``check=True`` a failed build raises CalledProcessError, whose message
    # omits the captured stdout/stderr and leaves the failure undiagnosable.
    build = subprocess.run(
        [sys.executable, "-m", "build", "--sdist", "--outdir", str(tmp_path)],
        cwd=repo_root,
        capture_output=True,
        text=True,
    )
    assert build.returncode == 0, (
        f"sdist build failed with exit code {build.returncode}:\n"
        f"{build.stdout}\n{build.stderr}"
    )

    sdists = list(tmp_path.glob("*.tar.gz"))
    assert len(sdists) == 1, f"expected one sdist, found {sdists}"

    # Collect the bare file names of every CSV shipped under utils/data/.
    with tarfile.open(sdists[0]) as tar:
        shipped = {
            Path(name).name
            for name in tar.getnames()
            if "/utils/data/" in name and name.endswith(".csv")
        }

    missing = expected_csvs - shipped
    assert not missing, f"sdist is missing sample CSVs: {sorted(missing)}"
utils_path: AnyStr = os.path.dirname(os.path.abspath(__file__)) - # Validate that the file indicated by the key argument exists. - dataset_path: str = os.path.join(utils_path, "data", key.lower() + ".csv") - - if not os.path.exists(dataset_path): + # Validate the key against the sample-dataset manifest. The manifest is the + # authoritative list of available samples; every entry has a matching CSV in + # the data folder. + if key.lower() not in SAMPLES: raise ValueError( """ Invalid key supplied. The key should match the name, without extension, of one of the file names @@ -102,162 +103,19 @@ def load_sample(key: str, *args, **kwargs) -> Triangle: ) ) - # Set initial values for arguments to Triangle __init__. These may be overridden by - # values specific to the data set. - origin: str = "origin" - development: str = "development" - columns: list = ["values"] - index: list | None = None - cumulative: bool = True - - if key.lower() in ["mcl", "usaa", "quarterly", "auto", "usauto", "tail_sample"]: - columns: list = ["incurred", "paid"] - if key.lower() == "clrd": - origin: str = "AccidentYear" - development: str = "DevelopmentYear" - index: list = ["GRNAME", "LOB"] - columns: list = [ - "IncurLoss", - "CumPaidLoss", - "BulkLoss", - "EarnedPremDIR", - "EarnedPremCeded", - "EarnedPremNet", - ] - if key.lower() == "clrd2025": - origin: str = "AccidentYear" - development: str = "DevelopmentYear" - index: list = ["GRNAME", "LOB"] - columns: list = [ - "IncurredLosses", - "CumPaidLoss", - "BulkLoss", - "EarnedPremDIR", - "EarnedPremCeded", - "EarnedPremNet", - ] - if key.lower() == "berqsherm": - origin: str = "AccidentYear" - development: str = "DevelopmentYear" - index: list = ["LOB"] - columns: list = ["Incurred", "Paid", "Reported", "Closed"] - if key.lower() == "xyz": - origin: str = "AccidentYear" - development: str = "DevelopmentYear" - columns: list = ["Incurred", "Paid", "Reported", "Closed", "Premium"] - if key.lower() in ["liab", "auto"]: - index: list = ["lob"] 
- if key.lower() in ["cc_sample", "ia_sample"]: - columns: list = ["loss", "exposure"] - if key.lower() in ["prism"]: - columns: list = ["reportedCount", "closedPaidCount", "Paid", "Incurred"] - index: list = [ - "ClaimNo", - "Line", - "Type", - "ClaimLiability", - "Limit", - "Deductible", - ] - origin: str = "AccidentDate" - development: str = "PaymentDate" - cumulative: bool = False - if "mack_1997" in key.lower(): - columns = ["Case Incurred"] - origin = "Accident Year" - development = "Calendar Year" - cumulative: bool = True - # Friedland datasets - if "friedland" in key.lower(): - columns: list = ["Paid Claims", "Reported Claims"] - origin: str = "Accident Year" - development: str = "Calendar Year" - cumulative: bool = True - index: None = None - if "autoprop" in key.lower(): - columns: list = [ - "Reported ALAE", - "Paid ALAE", - "Reported Claims", - "Paid Claims", - ] - if "auto_salsub" in key.lower(): - columns: list = [ - "Reported Salvage and Subrogation", - "Received Salvage and Subrogation", - "Reported Claims", - "Paid Claims", - ] - if "berq_sher_auto" in key.lower(): - columns: list = [ - "Paid Claims", - "Closed Claim Counts", - "Reported Claim Counts", - "Disposal Rate", - ] - if "gl_insurer" in key.lower(): - columns: list = [ - "Closed Claim Counts", - "Reported Claim Counts", - "Disposal Rate", - "Paid Claims", - ] - if "med_mal" in key.lower(): - columns: list = [ - "Reported Claims", - "Paid Claims", - "Case Outstanding", - "Open Claim Counts", - ] - if "qs" in key.lower(): - columns: list = [ - "Gross Reported Claims", - "Net Reported Claims", - "Net to Gross", - ] - if "auto_case" in key.lower(): - columns: list = ["Case Outstanding", "Paid Claims"] - if "wc_self_insurer" in key.lower(): - columns: list = [ - "Closed Claim Counts", - "Reported Claim Counts", - "Paid Claims", - "Paid Severities", - "Reported Claims", - "Reported Severities", - ] - if "xol" in key.lower(): - columns: list = [ - "Gross Reported Claims", - "Net Reported 
# Human-readable labels for the single-character grain codes a Triangle exposes
# via ``origin_grain`` / ``development_grain``.
_GRAIN_LABELS: dict = {
    "Y": "Annual",
    "S": "Semiannual",
    "Q": "Quarter",
    "M": "Month",
}


def list_samples(include_grain: bool = True) -> pd.DataFrame:
    """List the sample datasets bundled with the chainladder package.

    The returned table is driven by the sample-dataset manifest
    (``chainladder/utils/data/_manifest.py``), the same source
    :func:`load_sample` reads, so it always reflects exactly what is loadable.

    Parameters
    ----------
    include_grain: bool
        If ``True`` (default), load each sample to report its origin and
        development grain (and the number of origin/development periods). This
        is the slower path because every Triangle is built. Set to ``False`` to
        return just the manifest metadata (name, index, columns, cumulative)
        without loading any data.

    Returns
    -------
    pandas.DataFrame
        One row per manifest entry, sorted and indexed by sample name (the
        index is called ``name``), with columns ``index``, ``columns``,
        ``cumulative`` and, when ``include_grain`` is ``True``,
        ``origin_grain``, ``development_grain``, ``origin_periods`` and
        ``development_periods``. The ``index`` and ``columns`` cells hold
        copies of the manifest lists, so editing the returned frame never
        alters what :func:`load_sample` subsequently reads.

    Examples
    --------

    .. code-block:: python

        import chainladder as cl
        cl.list_samples()                     # full table, grain included
        cl.list_samples(include_grain=False)  # fast, metadata only
    """
    records: list = []
    for name in sorted(SAMPLES):
        config: dict = SAMPLES[name]
        index = config["index"]
        record: dict = {
            "name": name,
            # Copy the manifest lists: handing out the registry's own objects
            # would let an in-place edit of a DataFrame cell corrupt the
            # configuration load_sample relies on.
            "index": None if index is None else list(index),
            "columns": list(config["columns"]),
            "cumulative": config["cumulative"],
        }
        if include_grain:
            triangle = load_sample(name)
            # Fall back to the raw grain code when no friendly label is known.
            record["origin_grain"] = _GRAIN_LABELS.get(
                triangle.origin_grain, triangle.origin_grain
            )
            record["development_grain"] = _GRAIN_LABELS.get(
                triangle.development_grain, triangle.development_grain
            )
            record["origin_periods"] = len(triangle.origin)
            record["development_periods"] = triangle.development.shape[0]
        records.append(record)

    return pd.DataFrame.from_records(records).set_index("name")
library/sample_data.md + - file: library/sample_data.ipynb - file: library/glossary.md - file: library/questions_issues.md - file: library/contributing.md diff --git a/docs/library/api.md b/docs/library/api.md index d0985fab..7565e421 100644 --- a/docs/library/api.md +++ b/docs/library/api.md @@ -165,10 +165,10 @@ Functions :template: function.rst load_sample + list_samples read_pickle read_json concat - load_sample minimum maximum diff --git a/docs/library/sample_data.ipynb b/docs/library/sample_data.ipynb new file mode 100644 index 00000000..d983c656 --- /dev/null +++ b/docs/library/sample_data.ipynb @@ -0,0 +1,52 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Sample Dataset\n", + "\n", + "The `chainladder` package ships with a number of sample datasets that are\n", + "used throughout the documentation and examples. Load any of them with\n", + "`cl.load_sample(...)`, for example `cl.load_sample(\"abc\")`.\n", + "\n", + "The table below lists every available dataset alongside its basic\n", + "attributes. It is produced live by `cl.list_samples()`, which reads from the\n", + "sample-dataset manifest (`chainladder/utils/data/_manifest.py`), so it never\n", + "drifts out of sync with the datasets that actually ship." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import chainladder as cl\n", + "\n", + "cl.list_samples()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/library/sample_data.md b/docs/library/sample_data.md deleted file mode 100644 index b6a7dc07..00000000 --- a/docs/library/sample_data.md +++ /dev/null @@ -1,57 +0,0 @@ -# Sample Dataset - -Below is the list of all datasets that come included with the `chainladder` package, and their basic attributes. - -You can load any dataset with `cl.load_sample(...)` such as `cl.load_sample("abc")`. - -\* Denotes datasets that are more interesting and possess unique characteristics. 
- - -| Dataset Name | Indexes | Columns | Origin Grain | Development Grain | -|----------------------------------|----------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|--------------------------|--------------------------| -| abc | (none) | (none) | Annual (11 Yrs) | Annual (11 Yrs) | -| auto | [lob] | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | -| berqsherm | [LOB] | [Incurred, Paid, Reported, Closed] | Annual (8 Yrs) | Annual (8 Yrs) | -| cc_sample | [Total] | [loss, exposure] | Annual (5 Yrs) | Annual (5 Yrs) | -| clrd * | [GRNAME, LOB] | [IncurLoss, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded, EarnedPremNet] | Annual (10 Yrs) | Annual (10 Yrs) | -| clrd2025 * | [GRNAME, LOB] | [IncurredLosses, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded, EarnedPremNet] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_auto_bi_insurer* | (none) | [Paid Claims, Reported Claims] | Annual (9 Yrs) | Annual (9 Yrs) | -| friedland_auto_freq_sev* | (none) | [Closed Claim Counts, Reported Claim Counts, Reported Claims, Reported Severity] | Semiannual (10 Half-Yrs) | Semiannual (10 Half-Yrs) | -| friedland_autoprop | (none) | [Reported ALAE, Paid ALAE, Reported Claims, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) | -| friedland_auto_salsub | (none) | [Reported Salvage and Subrogation, Received Salvage and Subrogation, Reported Claims, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) | -| friedland_berq_sher_auto | (none) | [Paid Claims, Closed Claim Counts, Reported Claim Counts, Disposal Rate] | Annual (8 Yrs) | Annual (8 Yrs) | -| friedland_gl_insurer | (none) | [Closed Claim Counts, Reported Claim Counts, Disposal Rate Paid Claims] | Annual (8 Yrs) | Annual (8 Yrs) | -| friedland_med_mal | (none) | [Reported Claims, Paid Claims, Case Outstanding, Open Claim Counts] | Annual (8 Yrs) | Annual (8 Yrs) | -| friedland_qs | (none) | 
[Gross Reported Claims, Net Reported Claims, Net to Gross] | Annual (4 Yrs) | Annual (4 Yrs) | -| friedland_us_auto_chg_prod_mix | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_us_auto_incr_claim | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_us_auto_steady_state | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_us_industry_auto | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_us_industy_auto_case | (none) | [Case Outstanding, Paid Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_uspp_auto_incr_case | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_uspp_auto_incr_claim | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| frieldand_uspp_auto_steady_state | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_uspp_incr_claim_case | (none) | [Paid Claims, Reported Claims] | Annual (10 Yrs) | Annual (10 Yrs) | -| friedland_wc_self_insurer | (none) | [Closed Claim Counts, Reported Claim Counts, Paid Claims, Paid Severities, Reported Claims, Reported Severities] | Annual (8 Yrs) | Annual (10 Yrs) | -| friedland_xol | (none) | [Gross Reported Claims, Net Reported Claims, Ceded Reported Claims] | Annual (4 Yrs) | Annual (4 Yrs) | -| friedland_xyz_auto_bi* | (none) | [Paid Claims, Reported Claims] | Annual (11 Yrs) | Annual (11 Yrs) | -| friedland_xyz_case* | (none) | [Case Outstanding, Paid Claims] | Annual (11 Yrs) | Annual (11 Yrs) | -| friedland_xyz_disp | (none) | [Disposal Rate, Closed Claim Counts, Paid Claims] | Annual (8 Yrs) | Annual (8 Yrs) | -| friedland_xyz_freq_sev* | (none) | [Closed Claim Counts, Reported Claim Counts, Reported Claims, Reported Severities] | Annual (11 Yrs) | Annual (11 Yrs) | -| genins | (none) | (none) | Annual (10 Yrs) | Annual (10 Yrs) | -| 
ia_sample | [Total] | [loss, exposure] | Annual (6 Yrs) | Annual (6 Yrs) | -| liab * | [lob] | [values] | Annual (14 Yrs) | Annual (14 Yrs) | -| mack_1997 | (none) | [Case Incurred] | Annual (10 Yrs) | Annual (10 Yrs) | -| m3ir5 | (none) | (none) | Annual (14 Yrs) | Annual (14 Yrs) | -| mcl | [Total] | [incurred, paid] | Annual (7 Yrs) | Annual (7 Yrs) | -| mortgage | (none) | (none) | Annual (9 Yrs) | Annual (9 Yrs) | -| mw2008 | (none) | (none) | Annual (9 Yrs) | Annual (9 Yrs) | -| mw2014 | (none) | (none) | Annual (17 Yrs) | Annual (17 Yrs) | -| prism * | [ClaimNo, Line, Type, ClaimLiability, Limit, Deductible] | [reportedCount, closedPaidCount, Paid, Incurred] | Month (120 months) | Month (120 months) | -| quarterly * | [Total] | [incurred, paid] | Annual (12 Yrs) | Quarter (45 Qtrs) | -| raa | (none) | (none) | Annual (10 Yrs) | Annual (10 Yrs) | -| tail_sample | [Total] | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | -| ukmotor | (none) | (none) | Annual (7 Yrs) | Annual (7 Yrs) | -| usaa | [Total] | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | -| usauto | [Total] | [incurred, paid] | Annual (10 Yrs) | Annual (10 Yrs) | -| xyz* | [Total] | [Incurred, Paid, Reported, Closed, Premium] | Annual (11 Yrs) | Annual (11 Yrs) |