diff --git a/.github/workflows/deploy-docs.yaml b/.github/workflows/deploy-docs.yaml new file mode 100644 index 000000000..775e72d73 --- /dev/null +++ b/.github/workflows/deploy-docs.yaml @@ -0,0 +1,48 @@ +name: Deploy documentation + +on: + push: + branches: [ main ] + paths: + - "docs/**" + - "mkdocs.yml" + - ".github/workflows/deploy-docs.yaml" + pull_request: + paths: + - "docs/**" + - "mkdocs.yml" + - ".github/workflows/deploy-docs.yaml" + workflow_dispatch: + +permissions: + contents: write + +jobs: + validate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Install MkDocs Material + run: pip install mkdocs-material + - name: Build site (strict) + run: mkdocs build --strict + + deploy: + needs: validate + if: github.event_name != 'pull_request' + runs-on: ubuntu-latest + concurrency: + group: deploy-docs + cancel-in-progress: false + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Install MkDocs Material + run: pip install mkdocs-material + - name: Deploy to GitHub Pages + run: mkdocs gh-deploy --force diff --git a/README.md b/README.md index 33aa44000..a772891a6 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,13 @@ -# HH -> bbtautau Framework +# FLAF -FLAF - Flexible LAW-based Analysis Framework. -Task workflow managed is done via [LAW](https://github.com/riga/law) (Luigi Analysis Framework). +**FLAF** — the Flexible LAW-based Analysis Framework — is the shared CMS analysis framework used by +the HH→bb̄ττ, HH→bb̄WW and H→μμ analyses. Task workflows are managed with +[LAW](https://github.com/riga/law) (the Luigi Analysis Workflow). -Documentation is available on [GitHub Pages](https://cms-flaf.github.io/FLAF/). +📖 **Documentation: <https://cms-flaf.github.io/FLAF/>** + +FLAF is included as a git submodule inside each analysis repository — you do not clone it on its own +to run an analysis. 
Start with the +[installation guide](https://cms-flaf.github.io/FLAF/getting-started/installation/). diff --git a/docs/analyses.md b/docs/analyses.md new file mode 100644 index 000000000..119be3db1 --- /dev/null +++ b/docs/analyses.md @@ -0,0 +1,41 @@ +# Analyses + +FLAF is shared by three analyses. The **common** pipeline is documented here; each analysis adds +its own physics — extra submodules, observables, signals and (for the HH analyses) statistical +inference — documented in that analysis's own `docs/`. + +| Analysis | Channel | Adds on top of FLAF | Docs | +|---|---|---|---| +| **HH→bb̄ττ** | HH → bb̄ττ | SVfit (`ClassicSVfit`, `SVfitTF`), `HHKinFit2`, `HHbtag`, DeepTau; resonant + non-resonant signals; `StatInference`. | [github.com/cms-flaf/HH_bbtautau](https://github.com/cms-flaf/HH_bbtautau) → `docs/` | +| **HH→bb̄WW** | HH → bb̄WW | `DeepHME` mass reconstruction; b-tag-shape caching (`AnalysisCacheTask`); `StatInference`. | [github.com/cms-flaf/HH_bbWW](https://github.com/cms-flaf/HH_bbWW) → `docs/` | +| **H→μμ** | H → μμ | Single-Higgs; the simplest setup (just `FLAF` + `Corrections`); **no** statistical-inference submodule. | [github.com/cms-flaf/H_mumu](https://github.com/cms-flaf/H_mumu) → `docs/` | + +## What is common vs analysis-specific + +- **Common (here, in FLAF):** the [task graph](concepts/data-flow.md), the + [configuration system](concepts/configuration.md), the [environment](concepts/environment.md), + [storage](concepts/storage.md), [eras](concepts/eras.md) and [CI](ci/integration-pipeline.md). + The [full-workflow walkthrough](workflow/walkthrough.md) applies to every analysis. +- **Analysis-specific (in each repo's `docs/`):** the extra physics submodules and how to set them + up, the analysis's signals and processes, its observables and any analysis-only steps, and — for + HH→bb̄ττ and HH→bb̄WW — the statistical-inference configuration. 
+ +## HH→bb̄ττ — the reference analysis + +The most feature-complete analysis: SVfit and HHKinFit2 mass reconstruction, the HHbtag b-jet +identifier, DeepTau-based τ identification (select the version with +`--customisations deepTauVersion=2p5`), and resonant + non-resonant signal models. Used throughout +these docs as the worked example. + +## HH→bb̄WW + +Uses `DeepHME` for mass reconstruction instead of SVfit. Its pipeline inserts a b-tag-shape caching +step (`AnalysisCacheTask`/`AnalysisCacheAggregationTask`) before histogramming — see the caveat in +the [walkthrough](workflow/walkthrough.md#stage-2-compute-analysis-observables-histtuples) and +[Task reference](reference/tasks.md#analysiscachetask). + +## H→μμ + +A single-Higgs analysis with the leanest submodule set (no `StatInference`/`inference`). Its CI +runs over **all** Run 3 eras (`H_mumu_eras: ALL`), and its CI process names are lower-case +(`custom_CI_signal`, …) — see [Processes & models](configuration/processes-and-models.md). diff --git a/docs/analysis.md b/docs/analysis.md deleted file mode 100644 index 9ba54893d..000000000 --- a/docs/analysis.md +++ /dev/null @@ -1,45 +0,0 @@ -# Common analysis steps - -Remarks: - -- commands bellow assume that `ERA` variable is set. E.g. - ```sh - ERA=Run2_2016 - ``` - Alternatively you can add `ERA=Run2_2016; ...` in front of each command. - -- `version` argument alows to produce different versions of the same task. In the command below `--version dev` is used for illustration purposes. You can replace it with your version naming. -- `--workflow` can be `htcondor` or `local`. It is recommended to develop and test locally and then switch to `htcondor` for production. In examples below `--workflow local` is used for illustration purposes. -- when running on `htcondor` it is recommended to add `--transfer-logs` to the command to transfer logs to local. 
-- `--customisations` argument is used to pass custom parameters to the task in form param1=value1,param2=value2,... - IMPORTANT for HHbbTauTau analysis: if running using deepTau 2p5 add `--customisations deepTauVersion=2p5` -- if you want to run only on few files, you can specify list of branches to run using `--branches` argument. E.g. `--branches 2,7-10,17`. -- to get status, use `--print-stauts N,K` where N is depth for task dependencies, K is depths for file dependencies. E.g. `--print-status 3,1`. -- to remove task output use `--remove-output N,a`, where N is depth for task dependencies. E.g. `--remove-output 0,a`. -- it is highly recommended to limitate the maximum number of parallel jobs running adding `--parallel-jobs M` where M is the number of the parallel jobs (e.g. M=100) - -## Create input file list - -```sh -law run InputFileTask --period ${ERA} --version dev -``` - -## Create anaCache - -```sh -law run AnaCacheTask --period ${ERA} --version dev -``` - -## Create anaTuple - -```sh -law run AnaTupleTask --period ${ERA} --version dev -``` - -## Merge data - -```sh -law run DataMergeTask --period ${ERA} --version dev -``` - -- note: It's very important to first run `InputFileTask` then the other tasks dependencies are automatically fixed (e.g. if running `AnaTupleTask` without `AnaCacheTask`, it will first run `AnaCacheTask` then `AnaTupleTask`). If you do not run `InputFileTask`, running other tasks will raise error. \ No newline at end of file diff --git a/docs/ci/github-actions.md b/docs/ci/github-actions.md new file mode 100644 index 000000000..02ecdb52b --- /dev/null +++ b/docs/ci/github-actions.md @@ -0,0 +1,67 @@ +# GitHub Actions + +FLAF uses **two** continuous-integration systems: + +| System | Where | Purpose | +|---|---|---| +| **GitHub Actions** | GitHub | Fast code-quality and sanity checks on every pull request. | +| **FLAF integration** | GitLab CI (CERN) | The full pipeline run that checks physics correctness. 
Triggered by a bot comment — see [Integration pipeline](integration-pipeline.md). | + +This page covers the GitHub Actions checks. + +## Shared, reusable workflows + +The analysis repositories don't duplicate CI logic. Each workflow is a thin wrapper that calls the +shared implementation in FLAF: + +```yaml +jobs: + my-job: + uses: cms-flaf/FLAF/.github/workflows/<workflow>.yaml@main + secrets: inherit +``` + +So fixing a check in FLAF fixes it everywhere. (A checkout helper inside the shared workflows makes +the FLAF tooling — `.yamllint`, `.clang-format` — available even though FLAF is a submodule.) + +## The standard checks + +| Workflow | Runs on | What it checks | +|---|---|---| +| `formatting-check.yaml` | PRs | Code style: **flake8**/black (Python), **clang-format** (C++), **yamllint** (YAML). | +| `repo-sanity-checks.yaml` | PRs | Submodule-pointer consistency, repository health, no stray binary files. | +| `test-setup-loading.yaml` | PRs | Actually loads `Setup.py` for **every configured era** — catches config typos and broken references early (a real run, not a dry run). | +| `trigger-flaf-integration.yaml` | PR comments | Parses a `@cms-flaf-bot` comment and triggers the GitLab pipeline. See [Integration pipeline](integration-pipeline.md). | + +FLAF itself additionally runs: + +| Workflow | What it checks | +|---|---| +| `cross-section-check.yaml` | Cross-section values are consistent/valid. | +| `ds-consistency-check.yaml` | `datasets.yaml` entries are well-formed (generator, resolvable cross-section, naming) via `test/checkDatasetConfigConsistency.py`. | + +## Passing the checks before you push + +Formatting is enforced, so format **before** committing. 
The convenience script applies all +formatters at once (with `flaf_env` active): + +```sh +bash run_tools/apply_format.sh +``` + +Or run them individually: + +```sh +black <files> # Python +clang-format -i --style "file:.clang-format" <files> # C++ +yamllint -s -c .yamllint <files> # YAML +``` + +If you edited `datasets.yaml`, also run the consistency check from +[Datasets](../configuration/datasets.md#validate-the-dataset-config). See +[Contributing](../contributing.md) for the full pre-PR checklist. + +!!! note "Required secrets" + The bot-trigger workflow needs the org-level secrets `FLAF_INTEGRATION_TOKEN` (GitLab trigger) + and `FLAF_GITHUB_TOKEN` (to post the reply comment), inherited via `secrets: inherit`. The + quality checks need no secrets. diff --git a/docs/ci/integration-pipeline.md b/docs/ci/integration-pipeline.md new file mode 100644 index 000000000..915b60e94 --- /dev/null +++ b/docs/ci/integration-pipeline.md @@ -0,0 +1,103 @@ +# Integration pipeline + +The **FLAF integration pipeline** runs the actual analysis pipeline end-to-end (on tiny test +inputs) to check that a change produces correct results — not just that it is well formatted. It +runs on **GitLab CI at CERN** (project +[`cms-flaf/flaf_integration`](https://gitlab.cern.ch/cms-flaf/flaf_integration), project id +`210600`) and is triggered from GitHub by a bot comment. + +## Triggering it: `@cms-flaf-bot please test` + +On a pull request (in a repo that supports it), an authorised user posts a comment: + +```text +@cms-flaf-bot please test +``` + +The `trigger-flaf-integration.yaml` workflow then: + +1. checks the commenter is in `authorized_users` and the header is recognised; +2. reads `.github/integration_cfg.yaml` **from the PR's branch**; +3. substitutes the PR's own version (so the pipeline tests *this* PR); +4. triggers the GitLab pipeline and posts back a `[pipeline#…] started` comment (or a 👎 reaction if + it could not start). 
+ +Repos with the trigger enabled: HH_bbtautau, HH_bbWW, H_mumu, FLAF, Corrections, StatInference. + +!!! tip "Test a change that spans repositories" + Add lines to point a dependency at your PR or branch, e.g.: + ```text + @cms-flaf-bot please test + - https://github.com/cms-flaf/FLAF/pull/272 + ``` + Shorthands include `- <repo>_version=PR_<N>`, a `…/pull/<N>` URL, a `…/tree/<branch>` URL, and + `- gitlab_branch=<branch>` to run a non-default `flaf_integration` branch. + +## `integration_cfg.yaml` + +Each participating repo has `.github/integration_cfg.yaml`. It lists who may trigger, the accepted +comment headers, and the **variables** passed to the pipeline: + +```yaml +variables: + HH_bbtautau_version: "main" + FLAF_version: "default" # "default" = keep flaf_integration's current value + Corrections_version: "default" + HH_bbtautau_active: "1" # "1" = run this analysis, "0" = skip + HH_bbtautau_task: "FLAF.Analysis.tasks.HistPlotTask" + HH_bbtautau_args: "--branches 0 --test 1000" + HH_bbtautau_eras: "Run3_2022 Run3_2022EE Run3_2023 Run3_2023BPix" + HH_bbtautau_processes: "custom_CI_Signal custom_CI_Background custom_CI_Data" + TEST_TIMEOUT: "4h" +``` + +| Variable | Meaning | +|---|---| +| `<analysis>_active` | Whether to run that analysis (`1`/`0`). | +| `<analysis>_version` / `<package>_version` | Which version of a repo to use; `default` keeps the pipeline's current value. | +| `<analysis>_task` | The target task (the pipeline runs everything up to it). | +| `<analysis>_args` | Extra `law run` arguments (e.g. `--branches 0 --test 1000`). | +| `<analysis>_eras` | Eras to test (space-separated, or `ALL`). | +| `<analysis>_processes` | The processes to test (space-separated). **Required** for an active analysis — there is no default. | + +!!! warning "`<analysis>_processes` must be set for an active analysis" + The pipeline **errors at generation time** if an active analysis has no `processes`. 
The values + live in each repo's `integration_cfg.yaml` (capitalised for HH analyses, lower-case for H→μμ — + see [Processes & models](../configuration/processes-and-models.md)). They are declared but left + empty in `flaf_integration/.gitlab-ci.yml`, so the trigger accepts them while the real values + come from the triggering repo. + +### Root packages vs packages + +The shared trigger logic distinguishes: + +- **root packages** — repos with a `<repo>_active` variable (the analyses: HH_bbtautau, HH_bbWW, + H_mumu); +- **packages** — repos with a `<repo>_version` but no `<repo>_active` (FLAF, Corrections, StatInference). + +Both may trigger the pipeline; the distinction matters only when editing the trigger logic. + +## What the pipeline does + +```mermaid +flowchart LR + P[Parent pipeline
.gitlab-ci.yml] -->|generate_child_pipeline.py| C[Child pipeline] + C --> B[build: per analysis] + B --> T1[test_dataset:
per process] + T1 --> T2[test_era / test_multi_era] + T2 --> N[notify GitHub] +``` + +- The **parent** pipeline runs `scripts/generate_child_pipeline.py`, which expands the active + analyses × eras × processes into concrete jobs (pure Python, no PyYAML on the runner). +- The **child** pipeline builds each active analysis once, then runs the requested task per + process/era on tiny inputs (`--test`), and finally notifies GitHub of success/failure. +- Disabled analyses/eras are simply not emitted; jobs are non-interruptible so parallel pipelines + on the same branch don't cancel each other. + +## Reproducing CI locally + +You can run what a CI job runs without the bot — point `fs_default` at a local path, use +`phys_model: TestModel` and `--test 1000`, and launch the target task with `--workflow local`. See +[Your first run](../getting-started/first-run.md) and the +[`user_custom.yaml` guide](../configuration/user-custom.md). diff --git a/docs/concepts/architecture.md b/docs/concepts/architecture.md new file mode 100644 index 000000000..7847c0d6a --- /dev/null +++ b/docs/concepts/architecture.md @@ -0,0 +1,88 @@ +# Architecture + +FLAF is a **shared framework** used by several analyses. This page explains how the pieces fit +together — which repositories exist, how they are nested as git submodules, and what is "common" +versus "analysis-specific". + +## Repositories + +| Repository | Role | Host | +|---|---|---| +| [`FLAF`](https://github.com/cms-flaf/FLAF) | The framework: task definitions, config system, environment, CI. | GitHub | +| [`Corrections`](https://github.com/cms-flaf/Corrections) | Object corrections & systematics (pileup, b-tag, triggers, …). | GitHub | +| [`StatInference`](https://github.com/cms-flaf/StatInference) | Datacard creation and limit/fit tooling. | GitHub | +| [`inference`](https://gitlab.cern.ch/cms-flaf/inference) | The HH combine-based inference tooling (`dhi`). 
| CERN GitLab | +| `HH_bbtautau`, `HH_bbWW`, `H_mumu` | The **analysis** repositories. | GitHub | + +The first four are **shared** — every analysis uses the same `FLAF` and `Corrections`; the two HH +analyses also use `StatInference` and `inference`. They are pulled into each analysis as git +submodules pinned to a specific commit, so an analysis always builds against a known version of +the framework. + +## Submodule hierarchy + +You clone an **analysis** repository; it brings the shared framework and the analysis-specific +tools with it. For example, HH→bb̄ττ looks like this: + +```text +HH_bbtautau/ ← the analysis repository (you clone this) +├── FLAF/ ← shared framework (submodule) +│ ├── PlotKit/ ← plotting helpers (submodule of FLAF) +│ └── RunKit/ ← workflow utilities (vendored directory, not a submodule) +├── Corrections/ ← shared corrections (submodule) +├── StatInference/ ← shared stat tooling (submodule; HH analyses only) +├── inference/ ← HH combine tooling (submodule; HH analyses only) +├── ClassicSVfit/ SVfitTF/ ← analysis-specific physics tools (submodules) +├── HHKinFit2/ HHbtag/ ← analysis-specific physics tools (submodules) +├── SyncTool/ ← analysis-specific (submodule) +├── Analysis/ AnaProd/ ← analysis-specific task/producer code +├── config/ ← analysis-specific configuration (signals, channels, …) +├── env.sh ← the entry point you `source` +└── data/ ← local working area (proxy, small outputs) +``` + +Each analysis carries a different set of physics submodules: + +- **HH_bbtautau** — SVfit (`ClassicSVfit`, `SVfitTF`), `HHKinFit2`, `HHbtag`, `SyncTool`. +- **HH_bbWW** — `DeepHME`, `SyncTool`. +- **H_mumu** — the simplest: just `FLAF` and `Corrections` (no `StatInference`/`inference`). + +See [Analyses](../analyses.md) for what each one adds on top of the framework. + +## Common vs analysis-specific + +Understanding this split tells you *where to look* for any given thing. + +```mermaid +flowchart LR + subgraph Shared [Shared framework] + F[FLAF
tasks, config loader,
env, CI] + C[Corrections] + S[StatInference] + end + subgraph Ana [Analysis repository] + AC[config/
signals, channels,
observables] + AA[Analysis/ AnaProd/
producers, definitions] + SUB[physics submodules
 SVfit, DeepHME, ...] + end + F --> Ana + C --> Ana + S --> Ana +``` + +- **In `FLAF`**: the *machinery* — the task classes (`AnaProd/tasks.py`, `Analysis/tasks.py`), the + configuration loader (`Common/Setup.py`), the environment (`env.sh`), cross-sections and the + SM-background/data dataset lists common to all analyses, and the shared CI workflows. +- **In the analysis repository**: the *physics* — which signals to run, the channels and + categories, the analysis-specific observables and producers, and the analysis-specific + configuration that overrides or extends the framework defaults. + +The [configuration system](configuration.md) is what merges these two layers together at run +time, and the [data flow](data-flow.md) is what the shared tasks actually do with them. + +!!! note "Developing the shared submodules" + Because `FLAF`/`Corrections` are pinned submodules, editing them inside an analysis checkout + requires care so your edits are actually picked up. The environment supports pointing + `FLAF_PATH`/`CORRECTIONS_PATH` at an edited copy — see + [The environment](environment.md#developing-shared-submodules) and + [Contributing](../contributing.md). diff --git a/docs/concepts/configuration.md b/docs/concepts/configuration.md new file mode 100644 index 000000000..14b7beb9f --- /dev/null +++ b/docs/concepts/configuration.md @@ -0,0 +1,75 @@ +# Configuration system + +FLAF's behaviour — which datasets exist, which corrections apply, where outputs go, which +processes make up the analysis — is driven by **YAML configuration**. This page explains *how* the +configuration is assembled. For *how to change* specific things, see the +[Configuration guide](../configuration/user-custom.md). + +The implementation lives in `FLAF/Common/Setup.py`. + +## Four layers, merged in order + +When you run a task with `--period <era>`, FLAF loads configuration from **four directories**, in +this order: + +```python +config_path_order = [ + "<analysis>/FLAF/config", # 1. 
framework defaults (all analyses) + "<analysis>/FLAF/config/<era>", # 2. framework defaults for this era + "<analysis>/config", # 3. analysis-wide settings + "<analysis>/config/<era>", # 4. analysis settings for this era ← wins +] +``` + +Think of it as **general → specific**: the framework provides sensible cross-analysis defaults +(layers 1–2), and each analysis overrides or extends them (layers 3–4). The per-era directories let +2022 and 2023 differ without duplicating everything. + +### How values combine + +The merge rule depends on the value type, and this distinction matters: + +- **Scalars** (a string, a number, a bool): a later layer **overrides** an earlier one. So + `<analysis>/config/<era>` has the final say. +- **Lists** (most importantly `datasets.yaml`): later layers **extend** (concatenate) earlier + ones. Nothing is lost. + +The list behaviour is why datasets are *split* across files instead of duplicated: SM backgrounds +and data live in the framework's `FLAF/config/<era>/datasets.yaml`, while signals and custom +samples live in the analysis's `config/<era>/datasets.yaml`. After merging, **all** of them are +available together. See [Datasets](../configuration/datasets.md). + +## The objects you may meet in code + +| Class (in `Common/Setup.py`) | Role | +|---|---| +| `Config` | Loads and merges the YAML files from the four directories for one logical config (e.g. "datasets"). Accessed like a dict: `cfg["key"]`, `cfg.get("key", default)`. | +| `Setup` | The master configuration object. Built once per `(analysis, period)`; holds the merged datasets, processes, the physics model and more. | +| `PhysicsModel` | Classifies each process as **background**, **signal** or **data**, and expands *meta-processes* (parameterised families, e.g. all resonant masses). | + +`Setup` is a **singleton**: anywhere in the code, `Setup.getGlobal()` returns the one instance for +the current run, so all tasks see a consistent configuration. 
+ +## The key configuration files + +| File | Lives in | Holds | +|---|---|---| +| `user_custom.yaml` | analysis `config/` | **Your** personal, uncommitted settings: storage, model, options. [Guide](../configuration/user-custom.md). | +| `global.yaml` | both layers | Global settings: anaTuple/histTuple definitions, corrections, payload producers, signal types. | +| `datasets.yaml` | per-era, both layers | Dataset (sample) definitions. [Guide](../configuration/datasets.md). | +| `processes.yaml` | analysis `config/` | Logical processes built from datasets. [Guide](../configuration/processes-and-models.md). | +| `phys_models.yaml` | analysis `config/` | Which processes are background/signal/data for a model. [Guide](../configuration/processes-and-models.md). | +| `crossSections13p6TeV.yaml` | `FLAF/config/` | Cross-section values referenced by datasets. | + +!!! tip "Validate your config without running the pipeline" + Loading `Setup.py` for an era is exactly what the `test-setup-loading` CI check does — it + catches typos and missing references early. You can do the same locally by constructing the + setup for an era; if it loads, the config is internally consistent. + +## `user_custom.yaml` is part of the merge too + +Your `config/user_custom.yaml` overlays the merged configuration with personal values (storage +locations, `phys_model`, options like `compute_unc_variations`). For a single run you can layer an +*extra* file on top with `--user-custom <file>`, which is loaded last and therefore wins — handy +for one-off tests without editing your regular `user_custom.yaml`. See +[`user_custom.yaml`](../configuration/user-custom.md#per-run-overrides-user-custom). 
diff --git a/docs/concepts/data-flow.md b/docs/concepts/data-flow.md new file mode 100644 index 000000000..564ed7014 --- /dev/null +++ b/docs/concepts/data-flow.md @@ -0,0 +1,66 @@ +# Data flow + +This page follows the data through the pipeline: what each stage consumes, what it produces, and +how the products feed the next stage. It is the conceptual companion to the hands-on +[walkthrough](../workflow/walkthrough.md). + +## The pipeline at a glance + +```mermaid +flowchart TD + NANO[(CMS NanoAOD
DAS / WLCG)] + LIST[Input file lists] + ANA[anaTuples
analysis ntuples] + HTUP[histTuples
+ analysis observables] + HIST[Histograms
per process, per systematic] + PLOT[Plots] + STAT[Limits, scans,
pulls & impacts] + + NANO -->|InputFileTask| LIST + LIST -->|AnaTupleFileTask| ANA + ANA -->|AnaTupleMergeTask| ANA + ANA -->|HistTupleProducerTask| HTUP + HTUP -->|HistFromNtupleProducerTask| HIST + HIST -->|HistMergerTask| HIST + HIST -->|HistPlotTask| PLOT + HIST -->|StatInference + inference| STAT +``` + +## Stage by stage + +| Stage (task) | Consumes | Produces | +|---|---|---| +| **InputFileTask** | A DAS query for the requested datasets and era. | The concrete list of NanoAOD files to process. Runs first and cheaply; everything else keys off it. | +| **AnaTupleFileTask** | One NanoAOD file (one branch per file). | One **anaTuple**: a slimmed/skimmed analysis ntuple with the objects, weights and flags the analysis needs. Runs inside CMSSW via `AnaProd/anaTupleProducer.py`. | +| **AnaTupleMergeTask** | The per-file anaTuples for a dataset. | One merged anaTuple per dataset (data merged across runs). | +| **HistTupleProducerTask** | Merged anaTuples. | **histTuples**: ntuples with the heavier analysis **observables** computed (the "payload producers"). | +| **HistFromNtupleProducerTask** | histTuples. | **Histograms** of the requested variables, including systematic variations. Branches over variables. | +| **HistMergerTask** | Per-piece histograms. | Merged histograms per process, ready for plotting and fitting. | +| **HistPlotTask** | Merged histograms. | **Plots** (one branch per variable). | +| **Statistical inference** | Merged histograms / shapes. | Datacards, exclusion limits, likelihood scans, pulls & impacts (via `StatInference` and the `inference`/`dhi` combine tooling). | + +!!! note "Two helper tasks you will also see" + Some analyses (notably HH→bb̄WW) insert **`AnalysisCacheTask`** and + **`AnalysisCacheAggregationTask`** to pre-compute and aggregate per-event payloads (e.g. the + b-tag shape weights) before histogramming. They are part of the same graph and run + automatically when required. See the [Task reference](../reference/tasks.md). 
+ +## Where the outputs live + +Each output type is written to a **named filesystem** (`fs_*`) that you configure — typically +grid/EOS storage for the big ntuples and histograms, and a local `data/` area for small +artifacts. The mapping and how to set it are covered in [Storage & filesystems](storage.md) and +the [`user_custom.yaml` guide](../configuration/user-custom.md). The practical consequence: + +- Large products (anaTuples, histTuples, histograms) persist on shared storage, so collaborators + — and the next stage — can reuse them without recomputing. +- Because LAW skips tasks whose output already exists, **the pipeline is incremental**: re-running + a late stage only computes what is genuinely missing. + +## Versions keep productions apart + +Every output path includes the `--version` you chose. Two runs with different versions never +collide, which is how parallel productions, personal tests and official productions coexist on the +same storage. The per-task `--<TaskName>-version` overrides let one run *read* an existing +upstream production while *writing* its own downstream outputs under a new version — see +[Command arguments](../workflow/arguments.md#per-task-version-overrides). diff --git a/docs/concepts/environment.md b/docs/concepts/environment.md new file mode 100644 index 000000000..195330c58 --- /dev/null +++ b/docs/concepts/environment.md @@ -0,0 +1,93 @@ +# The environment + +`source env.sh` (from an analysis checkout) builds and activates everything FLAF needs. This page +explains what that environment contains, the variables it sets, and the few sharp edges to avoid. + +## What `env.sh` sets up + +The analysis `env.sh` sets `ANALYSIS_PATH` and `FLAF_PATH`, then hands off to `FLAF/env.sh`, which: + +1. **Activates `flaf_env`** — a Python virtual environment built from the CVMFS `LCG_108a` stack + (`x86_64-el9-gcc15-opt`), under `soft/flaf_env`. This provides Python, ROOT and the FLAF + dependencies, and registers the `law` command with tab-completion. 
+2. **Provides CMSSW** — installs/uses `CMSSW_16_0_6` (compiler `gcc13`) under `soft/`. The ntuple + production stages run inside it. +3. **Provides Combine** — builds standalone + [Combine](https://cms-analysis.github.io/HiggsAnalysis-CombinedLimit/) `v10.4.2` for statistical + inference, and (for HH analyses) wires up the `inference`/`dhi` tooling. +4. **Sets up grid access** — points `X509_USER_PROXY` at `data/voms.proxy` and initialises Rucio. +5. **Defines the `cmsEnv` helper** (see below). + +!!! note "First source is slow, the rest are fast" + The CMSSW and Combine builds happen only on the first `source env.sh`. After that it is a + quick activation. You must source it **once in every new shell**. + +## Key environment variables + +| Variable | Meaning | +|---|---| +| `ANALYSIS_PATH` | The analysis checkout (set by the analysis `env.sh`). | +| `FLAF_PATH` | The FLAF code in use. Defaults to `$ANALYSIS_PATH/FLAF`; override to develop FLAF (below). | +| `CORRECTIONS_PATH` | The Corrections code in use. Defaults to `$ANALYSIS_PATH/Corrections`. | +| `ANALYSIS_SOFT_PATH` | Where the built software lives (`$ANALYSIS_PATH/soft`). | +| `FLAF_ENVIRONMENT_PATH` | The `flaf_env` virtual environment (`$ANALYSIS_SOFT_PATH/flaf_env`). | +| `FLAF_CMSSW_BASE` | The CMSSW area used by the pipeline. | +| `FLAF_COMBINE_PATH` | The standalone Combine build. | +| `ANALYSIS_DATA_PATH` | The local `data/` working area. | +| `X509_USER_PROXY` | Your VOMS proxy (`data/voms.proxy`). | +| `LAW_HOME` / `LAW_CONFIG_FILE` | LAW's home (`.law`) and config (`config/law.cfg`). | +| `FLAF_NO_INSTALL` | When `1`, `env.sh` refuses to build anything (used on batch workers). | + +## `cmsEnv`: running inside CMSSW + +Some commands must run inside the CMSSW runtime. 
The `cmsEnv` alias runs a command in a clean +shell with just the CMSSW variables set: + +```sh +cmsEnv python3 my_cmssw_script.py +cmsEnv /bin/zsh # an interactive CMSSW subshell +``` + +You will see it most often around statistical-inference commands that call `combine`. + +## Developing shared submodules + +`FLAF` and `Corrections` are pinned **submodules** inside the analysis. If you edit the framework +in place and run the pipeline, your edits may be ignored — because the run uses the submodule copy. +The environment solves this cleanly: `FLAF_PATH` and `CORRECTIONS_PATH` are **inputs** to +`env.sh`. If they are already set when you source it, they are respected; otherwise they default to +the submodule copies. + +So to run against an edited copy of FLAF, set `FLAF_PATH` to that copy **before** sourcing: + +```sh +export FLAF_PATH=/path/to/your/edited/FLAF +source env.sh # everything downstream now uses the edited FLAF +``` + +Everything derived from it — `PYTHONPATH`, the code shipped in batch bundles, the worker bootstrap +— follows automatically. When `FLAF_PATH`/`CORRECTIONS_PATH` differ from the submodule copy, +`env.sh` also enables `PYTHONSAFEPATH` and prepends the right parent directory so the edited copy +wins for `import FLAF` / `import Corrections` (which are namespace packages). On HTCondor, non-bundle +jobs receive these paths (the AFS area is mounted on workers); bundle jobs ship the edited code +inside the tarball instead. See [Running on HTCondor](../workflow/htcondor.md) and +[Contributing](../contributing.md). + +## Sharp edges + +!!! danger "Do not strip `LD_LIBRARY_PATH` (`env -i`)" + Running the environment under `env -i` (a fully empty environment) removes `LD_LIBRARY_PATH`, + which ROOT/cling needs — you get cryptic library/JIT failures. If you must launch a clean + background shell, preserve `LD_LIBRARY_PATH` (and `HOME`, `PATH`). + +!!! 
danger "Do not source via `bash -c \"source env.sh\"`" + `env.sh` locates itself through `BASH_SOURCE`/`$0`. Sourcing it inside `bash -c "..."` breaks + that detection and sets the wrong `ANALYSIS_PATH`. Source it directly in your shell, or put the + commands in a script file and run that script. + +!!! warning "One environment per shell — beware cross-analysis contamination" + The environment caches paths in variables (`FLAF_PATH`, `ANALYSIS_SOFT_PATH`, …). Sourcing a + *second* analysis's `env.sh` in the same shell, or reusing a shell that already has another + analysis's variables, can pick up the wrong `flaf_env`. Use a fresh shell per analysis. When + scripting background runs, unset the `FLAF_*`/`ANALYSIS_*` variables first (see + [Troubleshooting](../troubleshooting.md#cross-analysis-environment-contamination)). diff --git a/docs/concepts/eras.md b/docs/concepts/eras.md new file mode 100644 index 000000000..58fac3c90 --- /dev/null +++ b/docs/concepts/eras.md @@ -0,0 +1,60 @@ +# Eras & periods + +Every run targets one **era** (also called a **period**), passed as `--period`. An era is a CMS +data-taking period; choosing one selects the matching datasets, corrections and NanoAOD version. + +## Run 3 eras (current) + +| `--period` | Description | √s | NanoAOD | +|---|---|---|---| +| `Run3_2022` | 2022, pre-ECAL repair | 13.6 TeV | v12 | +| `Run3_2022EE` | 2022, post-ECAL repair ("EE") | 13.6 TeV | v12 | +| `Run3_2023` | 2023, pre-BPix | 13.6 TeV | v13 | +| `Run3_2023BPix` | 2023, post-BPix install | 13.6 TeV | v13 | +| `Run3_2024` | 2024 | 13.6 TeV | v15 | +| `Run3_2025` | 2025 (future) | 13.6 TeV | — | + +## Run 2 eras (legacy) + +`Run2_2016_HIPM`, `Run2_2016`, `Run2_2017`, `Run2_2018` (13 TeV). Still defined, but new +development targets Run 3. + +## Why the split into sub-eras? 
+ +The detector and its calibration change *within* a year, so CMS treats those segments as separate +eras for analysis: + +- **2022** splits at the ECAL endcap repair → `Run3_2022` (before) and `Run3_2022EE` (after). +- **2023** splits at the pixel-detector "BPix" installation → `Run3_2023` and `Run3_2023BPix`. + +Each sub-era has its own corrections and luminosity, which is exactly why the +[configuration system](configuration.md) has a **per-era layer**: `FLAF/config/<era>/` and +`<analysis>/config/<era>/` carry the era-specific datasets and overrides. + +## What an era controls + +- **Datasets** — `config/<era>/datasets.yaml` lists the samples available for that era, including + the correct NanoAOD version path on DAS. +- **NanoAOD version** — the table above; the dataset entries point at the right `vNN` campaign. +- **Corrections** — pileup, b-tagging, trigger and other scale factors are era-specific. +- **Signals** — resonant/non-resonant signals exist for some eras and not others (for instance, + several signal families are not produced for `Run3_2024`). + +## Running several eras + +A task runs **one era at a time**. To cover multiple eras, launch the task once per era (often +scripted), or, in CI, list them in the `*_eras` variable (e.g. +`Run3_2022 Run3_2022EE Run3_2023 Run3_2023BPix`, or `ALL`). See the +[integration pipeline](../ci/integration-pipeline.md). + +!!! warning "`--period` must match an existing era directory" + If you pass an era that has no `config/<era>/` (or whose datasets are not defined), config + loading fails — and if a run unexpectedly drops into `InputFileTask` and queries DAS for + nothing, a wrong `--period`/`--version` combination is the usual cause. See + [Troubleshooting](../troubleshooting.md). + +## Adding a new era + +Adding an era means creating its per-era config directories in both the framework and the +analysis, wiring it into the CI era lists, and listing it in `test-setup-loading`.
The full +procedure is in [Datasets](../configuration/datasets.md#adding-a-new-era). diff --git a/docs/concepts/storage.md b/docs/concepts/storage.md new file mode 100644 index 000000000..45ef36809 --- /dev/null +++ b/docs/concepts/storage.md @@ -0,0 +1,77 @@ +# Storage & filesystems + +FLAF reads CMS data from the grid and writes large outputs (ntuples, histograms) to grid/EOS +storage, while keeping small artifacts locally. It abstracts every location behind a **named +filesystem** — a `fs_*` key you set in [`user_custom.yaml`](../configuration/user-custom.md). + +## Named filesystems (`fs_*`) + +Each output type has a filesystem name. You only have to set `fs_default`; the others fall back to +it when unset, so a one-line configuration is enough to get going. + +| Key | Used for | +|---|---| +| `fs_default` | The fallback for everything below. **The one key you must set.** | +| `fs_anaTuple` | Merged analysis ntuples (anaTuples). | +| `fs_anaCacheTuple` | Cached per-event payloads. | +| `fs_HistTuple` | histTuples (ntuples with analysis observables). | +| `fs_plots` | Plot outputs. | +| `fs_nanoAOD` | Location of NanoAOD inputs for **custom/local** samples (not from DAS). | +| `fs_das` | The DAS-backed filesystem used to resolve official datasets. | + +!!! tip "Start with just `fs_default`" + Set `fs_default` to your personal storage and leave the rest unset. Everything then lands in + one place, namespaced by `--version` and era. Split outputs across sites later, when you need + to (e.g. point `fs_anaTuple` at a Tier-2/Tier-3 with lots of space). + +## How to write a location + +A filesystem value is a storage URL (or a list of them). 
Two common forms: + +```yaml +# EOS via WebDAV (your CERNBox / EOS user area): +fs_default: davs://eoshome-k.cern.ch:8444/eos/user/k/kandroso/FLAF/HH_bbtautau/ + +# A WLCG site (Tier-3/Tier-2) by its name + logical path: +fs_anaTuple: T3_CH_CERNBOX:/store/user/<username>/HH_bbtautau/ +``` + +- `davs://…` is a direct WebDAV endpoint (good for your EOS user area). +- `T3_CH_CERNBOX:/store/...` names a registered WLCG site and a logical path under it; FLAF/LAW + resolves it to a real endpoint. `T3_CH_CERNBOX` (CERNBox) and `T3_US_FNALLPC` (FNAL LPC) are + common choices. +- A **local absolute path** (e.g. `/builds/.../output/HH_bbtautau`) is also valid — CI uses one so + its outputs stay on the runner. + +A value may also be a **list**, in which case the entries are tried in order — useful for +read-fallback across mirrors. + +## Local working area: `data/` + +Independently of the `fs_*` storage, each analysis checkout has a `data/` directory used for: + +- your **VOMS proxy** (`data/voms.proxy`, where `X509_USER_PROXY` points), +- LAW job files and logs, +- small local copies of outputs. + +This lives in your AFS checkout, not on grid storage. + +## The VOMS proxy is your storage key + +All grid/EOS access uses your **VOMS proxy**. If it expires, reads and writes to `fs_*` locations +fail — often with confusing "permission denied" or "file not found" messages. Refresh it with +`voms-proxy-init -voms cms -rfc -valid 192:00` (see [Installation](../getting-started/installation.md)). + +## A caveat worth knowing: read-after-write lag on EOS + +EOS is eventually consistent. A file you just **wrote** can be briefly **invisible** to a +subsequent existence check (seconds, occasionally longer). FLAF tolerates this in normal +operation, but if you script your own existence checks against freshly written outputs, probe with +a directory listing and a short retry rather than a single `exists()`. See +[Troubleshooting](../troubleshooting.md#eos-read-after-write-lag).
+ +## Keeping I/O off shared production areas + +When testing, point `fs_default` at *your own* area (and use a personal `--version`) so you never +write into a shared production path. The CI does the inverse — it points `fs_default` at the local +runner so a test never touches real storage at all. diff --git a/docs/concepts/tasks-and-law.md b/docs/concepts/tasks-and-law.md new file mode 100644 index 000000000..c1c4dc903 --- /dev/null +++ b/docs/concepts/tasks-and-law.md @@ -0,0 +1,90 @@ +# Tasks & LAW + +FLAF expresses the whole analysis as a set of **tasks** wired together by their dependencies, and +runs them with [LAW](https://github.com/riga/law) (on top of +[Luigi](https://luigi.readthedocs.io/)). You do not need to know Luigi to use FLAF, but a working +mental model of "tasks" pays off immediately. + +## What is a task? + +A **task** is one stage of work with three things defined: + +- **outputs** — the file(s) it produces (`output()`), +- **requirements** — the other tasks it depends on (`requires()`), +- a **run** step — what it actually does to turn inputs into outputs. + +The crucial property: **a task that already has its output is considered done.** LAW checks for +the output file; if it exists, the task is skipped. This makes the pipeline *resumable* — re-run a +late stage and only the missing upstream pieces are computed. + +## You request the end; LAW fills in the middle + +You almost never run the intermediate stages by hand. You ask for the task whose result you want, +and LAW walks the dependency graph and runs whatever is missing, in order: + +```sh +law run FLAF.Analysis.tasks.HistPlotTask --version v1 --period Run3_2022 +``` + +Even though plotting is the *last* stage, this single command will (if needed) resolve input +files, produce and merge ntuples, compute observables and fill histograms first. 
The dependency +graph for the analysis is, in order: + +```mermaid +flowchart LR + IFT[InputFileTask] --> ATF[AnaTupleFileTask] --> ATM[AnaTupleMergeTask] + ATM --> HTP[HistTupleProducerTask] --> HFN[HistFromNtupleProducerTask] + HFN --> HM[HistMergerTask] --> HP[HistPlotTask] +``` + +Every box is documented in the [Task reference](../reference/tasks.md), and the same chain is +walked through with commands in the [full-workflow walkthrough](../workflow/walkthrough.md). + +## Workflows and branches + +Most FLAF tasks are **workflows**: they split into many independent **branches** that can run in +parallel. What a branch *is* depends on the task: + +- `AnaTupleFileTask` has **one branch per input NanoAOD file**. +- `HistPlotTask` has **one branch per variable** to plot. +- merge tasks have one branch per dataset/process. + +Two arguments control workflows: + +- `--workflow local` runs branches on the current machine; `--workflow htcondor` submits them to + the [batch system](../workflow/htcondor.md). +- `--branches 0,2,5-7` runs only selected branches (great for testing one file or one variable). + +!!! tip "`--branches 0` does not mean 'a tiny run of everything'" + `--branches` only restricts the *task you launched*. Its upstream dependencies still run for + everything they need. For example `HistPlotTask --branches 0` plots one variable, but the + ntuples and histograms it needs are still produced for all datasets. To make a run genuinely + small, combine it with `--test` (few events) and `phys_model: TestModel` (few processes). + +## Inspecting and cleaning up + +LAW gives you commands to see and manage the state of the graph without running it: + +| Command | What it does | +|---|---| +| `--print-status N,K` | Show the status of the dependency tree to *task depth* `N` and *file-collection depth* `K`. `--print-status 3,1` is a good default. The output also reveals each output's path. 
| +| `--print-deps N` | Print the dependency tree to depth `N` without checking outputs. | +| `--remove-output N,a,y` | Remove this task's outputs to depth `N`. `a` = all branches, `y` = no confirmation. Use to force a redo. | +| `--parallel-jobs M` | Cap the number of branches running at once (e.g. `--parallel-jobs 100`). Strongly recommended for large local or batch runs. | + +!!! warning "`--remove-output` deletes files" + It removes real outputs (including on grid storage). Double-check the depth and the version + before confirming, especially in a shared production area. + +## Where do the common options come from? + +`--version`, `--period`, `--workflow`, `--branches`, `--test`, `--customisations`, `--process`, +`--user-custom` and the per-task `--*-version` overrides are defined on FLAF's base task classes +(in `FLAF/run_tools/law_customizations.py`), so they are available on **every** FLAF task. They +are all catalogued in [Command arguments](../workflow/arguments.md). + +## When to re-index + +LAW maintains an index of available tasks. Re-run `law index --verbose` after you **add, rename +or move** a task class, or you will get "task not found" errors. Simply editing the body of an +existing task does not require re-indexing. diff --git a/docs/configuration/datasets.md b/docs/configuration/datasets.md new file mode 100644 index 000000000..91de612e6 --- /dev/null +++ b/docs/configuration/datasets.md @@ -0,0 +1,84 @@ +# Datasets + +A **dataset** is one CMS sample — a simulated signal/background, or a chunk of real data — +identified by its DAS name. Datasets are declared per era in `datasets.yaml` files. Thanks to the +[configuration merge](../concepts/configuration.md#how-values-combine), the lists from the +framework and the analysis are concatenated, so all datasets for an era are available together. 
+ +## The split rule (where a dataset belongs) + +| Kind of sample | Goes in | +|---|---| +| SM background, real data | `FLAF/config/<era>/datasets.yaml` (framework — common to all analyses) | +| Signal, custom/CI sample | `<analysis>/config/<era>/datasets.yaml` (analysis-specific) | + +This keeps the common SM samples in one shared place while each analysis owns its signals. + +## Dataset entry format + +```yaml +DatasetName: + generator: powheg # madgraph, powheg, pythia, ... + mass: 125 # optional (signals) + spin: 0 # optional (resonant signals) + crossSection: 1pb # a value, or a key into crossSections13p6TeV.yaml + nanoAOD: + v12: # NanoAOD campaign tag (matches the era) + - /DAS/path/to/dataset/NANOAODSIM + v15: + - /DAS/path/to/dataset/NANOAODSIM +``` + +For **custom/local** samples (e.g. CI test inputs) that are not official DAS datasets, point at +your own storage instead: + +```yaml +custom_CI: + generator: powheg + mass: 125 + spin: 0 + crossSection: 1pb + fs_nanoAOD: T3_CH_CERNBOX:/store/user/<username>/ + dirName: "directory_name" +``` + +## Cross-sections + +MC datasets reference a **cross-section**, either inline (`crossSection: 1pb`) or by a key into +`FLAF/config/crossSections13p6TeV.yaml` (13.6 TeV; `crossSections13TeV.yaml` for Run 2). For +signals whose normalisation is set elsewhere, a placeholder such as `1pb` is conventional. + +## Adding a dataset + +1. **Choose the file** by the split rule above. +2. **Add the entry** in the right era's `datasets.yaml`, following the format. +3. Make sure the `crossSection` resolves — add it to `crossSections13p6TeV.yaml` if needed. +4. For **Run3_2024 signals**, check the actual DAS name: the naming changed (e.g. VBF drops the + `_fixedTauDecays` suffix and uses a `_Par-` form). The conventions are summarised in the + analysis docs and the project notes. +5. **Validate** (below).
+ +## Validate the dataset config + +The same check the CI runs (`ds-consistency-check`) verifies that MC entries have a generator and a +resolvable cross-section, that names are well-formed, etc.: + +```sh +python3 test/checkDatasetConfigConsistency.py \ + --exception config/dataset_exceptions.yaml \ + Run3_2022 Run3_2022EE Run3_2023 Run3_2023BPix Run3_2024 Run3_2025 +``` + +Run it after editing any `datasets.yaml`. Known, intentional exceptions live in +`config/dataset_exceptions.yaml`. See [CI / GitHub Actions](../ci/github-actions.md). + +## Adding a new era + +1. Create `FLAF/config/<era>/` with at least `datasets.yaml` and `global.yaml`. +2. Create `<analysis>/config/<era>/` with the analysis-specific overrides and signals. +3. Add the era to `test-setup-loading.yaml` in each affected analysis (so CI loads `Setup.py` for + it and catches config errors early). +4. Add the era to the `*_eras` variable in the relevant `.github/integration_cfg.yaml` if it + should be part of CI runs. See [Integration pipeline](../ci/integration-pipeline.md). + +See also [Eras & periods](../concepts/eras.md). diff --git a/docs/configuration/processes-and-models.md b/docs/configuration/processes-and-models.md new file mode 100644 index 000000000..1f9a0cb74 --- /dev/null +++ b/docs/configuration/processes-and-models.md @@ -0,0 +1,76 @@ +# Processes & physics models + +[Datasets](datasets.md) are individual CMS samples. A **process** is the *physics* object you +actually plot and fit — one or more datasets grouped together (e.g. "TT", "DY", "signal"). A +**physics model** then declares which processes count as background, signal or data. Both are +analysis-specific config.
+ +## `processes.yaml` — logical processes + +```yaml +ProcessName: + dataset_names: + - DatasetName1 + - DatasetName2 + processor: ProcessorClass # optional custom processor + subProcesses: # optional composition from other processes + - SubProcessName1 +``` + +A process gathers the datasets that represent the same physics, optionally composes other +processes (`subProcesses`), and optionally names a custom `processor`. + +### Meta-processes + +A **meta-process** is a *template* that expands into a family of concrete processes — for example +"the resonant signal at every mass point" — instead of writing each one out. It is marked with +`is_meta_process: true` and expands at setup time (the [`PhysicsModel`](../concepts/configuration.md) +performs the expansion, substituting parameters such as the mass). + +!!! info "Meta-processes are selectable directly" + You can target a meta-process by name (e.g. `--process custom_CI_Signal`); FLAF expands it to + its concrete member(s) for the requested era. This is what the CI uses for its signal test. + +## `phys_models.yaml` — what is signal vs background vs data + +```yaml +ModelName: + backgrounds: + - ProcessName1 + - ProcessName2 + signals: + - SignalProcessName + data: + - DataProcessName +``` + +A model is just a named partition of processes into the three roles. Which model a run uses is set +by `phys_model` in [`user_custom.yaml`](user-custom.md) (or `--model`). + +### `TestModel` vs the production model + +- **`TestModel`** — a deliberately small set of processes, so the whole pipeline runs fast + end-to-end. Use it for development, local testing and CI. +- **`BaseModel`** (or the analysis's named production model) — the full set used for real results. + +!!! 
tip "Process names differ slightly between analyses" + The CI process names are capitalised in the HH analyses (`custom_CI_Signal`, + `custom_CI_Background`, `custom_CI_Data`) and lower-case in H→μμ (`custom_CI_signal`, + `custom_CI_background`, `custom_CI_data`). Use the exact name from that analysis's + `processes.yaml`. + +## How processes relate to the rest + +```mermaid +flowchart LR + DS[datasets.yaml
CMS samples] --> PR[processes.yaml
physics groupings] + PR --> PM[phys_models.yaml
bkg / signal / data] + PM -->|phys_model in user_custom| RUN[a run] +``` + +- A **dataset** is a file set on DAS. +- A **process** groups datasets into physics. +- A **model** labels processes as background/signal/data and is what a run actually uses. + +See the [configuration system](../concepts/configuration.md) for how these files are loaded and +merged, and each analysis's docs for its concrete processes and models. diff --git a/docs/configuration/user-custom.md b/docs/configuration/user-custom.md new file mode 100644 index 000000000..16e122ac2 --- /dev/null +++ b/docs/configuration/user-custom.md @@ -0,0 +1,77 @@ +# `user_custom.yaml` + +`config/user_custom.yaml` holds **your personal, uncommitted settings** — where your outputs go, +which physics model to use, and a handful of options. It is loaded on top of the merged +[configuration](../concepts/configuration.md), so it overrides the defaults for *your* runs without +changing anything for anyone else. It is git-ignored: it never gets committed. + +## A minimal file to get started + +```yaml +# Where outputs go (your EOS / CERNBox user area): +fs_default: davs://eoshome-<initial>.cern.ch:8444/eos/user/<initial>/<username>/FLAF/HH_bbtautau/ + +# Use the small, fast set of processes while testing: +phys_model: TestModel + +# Standard options: +analysis_config_area: config +compute_unc_variations: true +compute_unc_histograms: true +store_noncentral: true +``` + +Replace `<initial>`/`<username>` with yours (e.g. `k` / `kandroso`). With just this, you can run the +[first-run smoke test](../getting-started/first-run.md). + +## Fields + +| Field | Type | Meaning | +|---|---|---| +| `fs_default` | string or list | **Required.** Default storage for all outputs. The fallback for every other `fs_*`. See [Storage](../concepts/storage.md). | +| `fs_anaTuple`, `fs_HistTuple`, `fs_anaCacheTuple`, `fs_plots`, … | string/list | Optional per-output-type storage. Unset ⇒ uses `fs_default`. 
| +| `phys_model` | string | Which [physics model](processes-and-models.md) to run: `TestModel` (small, for testing/CI) or the analysis's production model (e.g. `BaseModel`). | +| `analysis_config_area` | string | The analysis config directory, relative to the checkout — normally `config`. | +| `compute_unc_variations` | bool | Whether to compute systematic (up/down) variations during production. | +| `compute_unc_histograms` | bool | Whether to also fill histograms for those variations. | +| `store_noncentral` | bool | Whether to keep the non-central (systematic-shift) outputs, not just the central one. | +| `variables` | list | Restrict which variables are produced/plotted. Omit for the full set. | + +!!! tip "`TestModel` is the fast path" + `TestModel` selects a reduced set of processes so the pipeline runs quickly end-to-end. Use it + for development and local testing; switch to the production model only when you need full + results. This is exactly what CI does. + +## A production-style example + +```yaml +fs_default: davs://eoshome-k.cern.ch:8444/eos/user/k/kandroso/FLAF/HH_bbtautau/ +# A separate, roomier site for the big ntuples: +fs_anaTuple: T3_US_FNALLPC:/store/user/lpcflaf/HH_bbtautau/ + +phys_model: BaseModel +analysis_config_area: config +compute_unc_variations: true +compute_unc_histograms: true +store_noncentral: true +``` + +## Per-run overrides (`--user-custom`) + +To change settings for a **single run** without editing your personal `user_custom.yaml`, pass an extra YAML +with `--user-custom`. It is loaded *after* `user_custom.yaml`, so its values win: + +```sh +law run FLAF.Analysis.tasks.HistPlotTask \ + --version my_test --period Run3_2022 --workflow local --branches 0 --test 1000 \ + --user-custom /path/to/extra.yaml +``` + +The path may be absolute or relative to `$ANALYSIS_PATH`. 
This is the preferred way to run one-off +variants (a different model, a different storage area, a short `variables:` list) — it keeps your +`user_custom.yaml` clean and is reproducible. + +!!! note "The CI uses a dedicated file" + The integration pipeline supplies its own `ci_custom.yaml` (local storage, `TestModel`, a short + `variables:` list) instead of a personal file, so tests never touch real storage. See + [Integration pipeline](../ci/integration-pipeline.md). diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 000000000..1bb44d5e8 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,88 @@ +# Contributing + +How to make a change to FLAF (or an analysis) and get it merged. The same workflow applies to the +shared submodules (Corrections, StatInference). + +## Branch, don't commit to `main` + +Always work on a **topic branch** and open a pull request — never commit directly to `main`. + +```sh +git checkout -b my-short-topic-name +# ... make changes ... +git commit -m "short, clear one-line description" +git push origin my-short-topic-name # then open a PR on GitHub +``` + +If your change spans repositories (e.g. FLAF **and** an analysis), use the **same branch name** in +each affected repo so reviewers can find the matching pieces. + +## Format before every commit + +Formatting is CI-enforced ([GitHub Actions](ci/github-actions.md)). Apply all formatters at once +(with `flaf_env` active): + +```sh +source env.sh +bash run_tools/apply_format.sh +``` + +This runs black (Python), clang-format (C++) and yamllint (YAML). You can also run them on +individual files — see [GitHub Actions](ci/github-actions.md#passing-the-checks-before-you-push). + +## Re-index after adding a task + +If you added, renamed or moved a LAW task class, refresh the index so it can be found: + +```sh +law index --verbose +``` + +## Validate config changes + +- Edited `datasets.yaml`? 
Run the + [consistency check](configuration/datasets.md#validate-the-dataset-config). +- Added an era or changed config loading? Make sure `Setup.py` still loads for every era (this is + what `test-setup-loading` does in CI). + +## Open the PR and run the checks + +On the pull request: + +1. The GitHub Actions checks (formatting, sanity, setup-loading) run automatically. +2. For a real physics check, ask an authorised user to trigger the full pipeline with a + `@cms-flaf-bot please test` comment — see [Integration pipeline](ci/integration-pipeline.md). + +### Pre-PR checklist + +- [ ] On a topic branch (not `main`) +- [ ] `bash run_tools/apply_format.sh` is clean +- [ ] `law index --verbose` run if you added/renamed a task +- [ ] dataset consistency check run if you touched `datasets.yaml` +- [ ] no binary files staged +- [ ] docs updated if behaviour or interfaces changed (see below) + +## Editing the documentation + +These docs are [MkDocs](https://www.mkdocs.org/) with the +[Material](https://squidfunk.github.io/mkdocs-material/) theme; the sources are the Markdown files +under `docs/` and the navigation is in `mkdocs.yml`. To preview locally: + +```sh +pip install mkdocs-material # once, e.g. in a throwaway venv +mkdocs serve # live preview at http://127.0.0.1:8000 +mkdocs build --strict # what to run before committing: fails on broken links +``` + +`mkdocs build --strict` catches broken internal links, missing nav entries and missing assets — +run it before you commit doc changes. There is a :material-pencil: **edit** action on every page +that takes you straight to the source file on GitHub. + +Guidelines for docs changes: + +- Keep framework-wide material here in FLAF; put analysis-specific material in that analysis's + `docs/` (see [Analyses](analyses.md)). Link rather than duplicate. +- Prefer concrete, copy-pasteable commands, and flag caveats with admonitions + (`!!! warning`, `!!! tip`). 
+- Remember the audience includes physicists new to the tooling — define terms or link the + [Glossary](glossary.md). diff --git a/docs/getting-started/first-run.md b/docs/getting-started/first-run.md new file mode 100644 index 000000000..3df062a06 --- /dev/null +++ b/docs/getting-started/first-run.md @@ -0,0 +1,88 @@ +# Your first run + +This page walks you through a **minimal end-to-end run**: a single command that exercises the +*whole* FLAF pipeline — from CMS NanoAOD all the way to a histogram plot — on just a handful of +events. It is the same smoke test that the [continuous integration](../ci/integration-pipeline.md) +runs on every change, so if it works for you, your setup is healthy. + +## Before you start + +You need a working environment from [Installation](installation.md): + +```sh +cd HH_bbtautau # or your analysis repository +source env.sh # once per shell +voms-proxy-info # confirm you have a valid proxy +``` + +You also need two settings in `config/user_custom.yaml`: + +- a storage location for outputs (the `fs_default` field), and +- `phys_model: TestModel` — a small, fast subset of processes meant for testing. + +See the [Configuration guide](../configuration/user-custom.md) for a ready-to-copy minimal file. + +## Run it + +```sh +law run FLAF.Analysis.tasks.HistPlotTask \ + --version my_first_run \ + --period Run3_2022 \ + --workflow local \ + --branches 0 \ + --test 1000 +``` + +That one command asks LAW for the final plots. LAW notices that none of the inputs exist yet and +**automatically runs every upstream stage first** — resolving the input file list, producing and +merging analysis ntuples, computing observables, filling and merging histograms — before making +the plot. You do not run the intermediate tasks yourself. + +### What each argument means + +| Argument | Meaning | +|---|---| +| `FLAF.Analysis.tasks.HistPlotTask` | The task you want — here, the plotting task (the end of the chain). 
| +| `--version my_first_run` | A label for this run. Outputs are grouped under it, so you can keep runs apart. Use any name. | +| `--period Run3_2022` | Which data-taking [era](../concepts/eras.md) to process. | +| `--workflow local` | Run on this machine (not the batch system). Good for testing. | +| `--branches 0` | Only the first work unit. `HistPlotTask` has one branch per variable, so this plots a single variable. | +| `--test 1000` | Process only 1000 events per input file — fast, just to check the machinery. | + +These and many more options are catalogued in [Command arguments](../workflow/arguments.md). + +## What to expect + +- LAW prints a dependency tree and then runs the stages bottom-up. Because the early stages + produce analysis ntuples from CMS NanoAOD (using CMSSW and reading from the grid), **the first + run is not instant** even with `--test 1000` — budget a little time. +- Output is written under the storage you configured in `user_custom.yaml`, organised by version + and era. A local copy of small artifacts also appears under `data/`. +- When the top task finishes, LAW reports success for `HistPlotTask`. + +!!! tip "Check progress without running anything" + In another shell (after `source env.sh`), ask LAW for the status of the dependency tree: + ```sh + law run FLAF.Analysis.tasks.HistPlotTask \ + --version my_first_run --period Run3_2022 --print-status 3,1 + ``` + The numbers are *task depth* and *file-collection depth*. This is the quickest way to see + which stage is done and where its output lives. + +## If it fails + +- **`InputFileTask` keeps appearing / DAS errors** — usually a wrong era/version or an expired + proxy. Re-run `voms-proxy-init`. See [Troubleshooting](../troubleshooting.md). +- **`law: command not found`** — you did not `source env.sh` in this shell. +- **Import errors / empty submodule dirs** — you cloned without `--recursive`; run + `git submodule update --init --recursive`. 
+ +More symptoms and fixes are collected in [Troubleshooting](../troubleshooting.md). + +## Next steps + +You have run the whole pipeline once. Now learn what actually happened: + +- [Tasks & LAW](../concepts/tasks-and-law.md) — what a task is and how LAW chains them. +- [Full workflow walkthrough](../workflow/walkthrough.md) — every stage, in order, with commands. +- [Running on HTCondor](../workflow/htcondor.md) — scale up from `local` to the batch system. diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md new file mode 100644 index 000000000..a900a9a07 --- /dev/null +++ b/docs/getting-started/installation.md @@ -0,0 +1,112 @@ +# Installation + +## You install an *analysis*, not FLAF itself + +This is the single most important thing to understand about setting up FLAF: + +!!! warning "FLAF is a submodule — never clone it on its own to run an analysis" + FLAF is shared machinery that lives **inside** each analysis repository as a git submodule. + Its `env.sh` deliberately refuses to run unless an analysis has set it up first. To work with + FLAF you clone one of the **analysis repositories**, which brings the right version of FLAF + (and the other shared submodules) with it. + +The available analysis repositories are: + +| Repository | Channel | Clone URL | +|---|---|---| +| [`HH_bbtautau`](https://github.com/cms-flaf/HH_bbtautau) | HH → bb̄ττ | `git@github.com:cms-flaf/HH_bbtautau.git` | +| [`HH_bbWW`](https://github.com/cms-flaf/HH_bbWW) | HH → bb̄WW | `git@github.com:cms-flaf/HH_bbWW.git` | +| [`H_mumu`](https://github.com/cms-flaf/H_mumu) | H → μμ | `git@github.com:cms-flaf/H_mumu.git` | + +The examples below use **HH_bbtautau** (the reference analysis); the steps are identical for the +others. + +## 1. Clone the analysis repository (with submodules) + +```sh +git clone --recursive git@github.com:cms-flaf/HH_bbtautau.git +cd HH_bbtautau +``` + +!!! 
danger "Do not forget `--recursive`" + FLAF, Corrections, the inference tooling and analysis-specific submodules (e.g. SVfit, + HHKinFit2, HHbtag) are all git submodules. Without `--recursive` you get empty directories and + confusing import errors. If you already cloned without it, run: + ```sh + git submodule update --init --recursive + ``` + +## 2. Set up the environment + +```sh +source env.sh +``` + +Sourcing the analysis's `env.sh` is how you enter the FLAF environment. It: + +1. sets `ANALYSIS_PATH` to the repository and points `FLAF_PATH` at the bundled `FLAF/` submodule; +2. **the first time**, builds everything it needs (this is the slow part): + - a Python virtual environment `flaf_env` (from the CVMFS `LCG_108a` stack) under `soft/`; + - a CMSSW area (`CMSSW_16_0_6`) used by the parts of the pipeline that need CMS software; + - a standalone [Combine](https://cms-analysis.github.io/HiggsAnalysis-CombinedLimit/) + (`v10.4.2`) for statistical inference; +3. activates that environment and registers the `law` command and tab-completion; +4. defines a `cmsEnv` helper for running commands inside CMSSW; +5. points your VOMS proxy location at `data/voms.proxy`. + +!!! note "The first `source env.sh` is slow; later ones are fast" + The initial build compiles CMSSW and Combine and can take **tens of minutes** and a few GB of + disk under `soft/`. It only happens once. Afterwards, `source env.sh` takes a few seconds and + just activates the already-built environment. You must `source env.sh` **once per shell** + (every new terminal). + +??? info "Advanced: skipping the build on worker nodes (`FLAF_NO_INSTALL`)" + Setting `FLAF_NO_INSTALL=1` makes `env.sh` fail instead of building anything if the + environment is missing. This is used on batch workers, where the environment is shipped in a + bundle rather than built on the node. You will not normally set it by hand. 
See + [The environment](../concepts/environment.md) and [Running on HTCondor](../workflow/htcondor.md). + +## 3. Index the LAW tasks + +LAW needs to discover the available tasks. Run this once after installation (and again whenever +you add or rename a task): + +```sh +law index --verbose +``` + +You should see the FLAF tasks listed (`InputFileTask`, `AnaTupleFileTask`, `HistPlotTask`, …). + +## 4. Get a VOMS proxy + +The pipeline reads and writes grid storage, which needs a short-lived **VOMS proxy**. Create one +(valid here for 8 days): + +```sh +voms-proxy-init -voms cms -rfc -valid 192:00 +``` + +Because `env.sh` sets `X509_USER_PROXY` to `data/voms.proxy`, the proxy is written where FLAF +expects it. Check it any time with: + +```sh +voms-proxy-info +``` + +!!! warning "Expired proxy = mysterious failures" + A common cause of "permission denied" / "file not found" errors on grid storage is simply an + expired proxy. If something that worked yesterday fails today, re-run `voms-proxy-init`. + +## 5. Create your `user_custom.yaml` + +Finally, tell FLAF where *your* outputs should go and which physics model to use, by creating +`config/user_custom.yaml`. This file holds your personal, uncommitted settings (storage paths, +test vs. production model). The [Configuration guide](../configuration/user-custom.md) explains +every field; a minimal file is enough to start. + +## What now? + +- Smoke-test the whole chain on a handful of events: [Your first run](first-run.md). +- Not sure what "task", "era" or "process" mean here? [Key terms](key-terms.md). +- Want the full story? The [Concepts](../concepts/architecture.md) section explains the + architecture and the environment in depth. 
diff --git a/docs/getting-started/key-terms.md b/docs/getting-started/key-terms.md new file mode 100644 index 000000000..dc2dcb6f8 --- /dev/null +++ b/docs/getting-started/key-terms.md @@ -0,0 +1,36 @@ +# Key terms + +FLAF borrows vocabulary from software-workflow tools (LAW/Luigi) and from CMS computing. If your +background is physics, a few words may be unfamiliar or may mean something slightly different +from what you expect. This page is the quick crosswalk; the [Glossary](../glossary.md) has the +full list. + +| Term | What it means in FLAF | +|---|---| +| **Task** | One stage of the pipeline (e.g. "produce analysis ntuples", "make plots"). A unit of work LAW knows how to run, with defined inputs and outputs. You request a task and LAW runs whatever it depends on. | +| **Workflow** | A task that splits into many independent **branches**. Run it `--workflow local` (on your machine) or `--workflow htcondor` (on the batch system). | +| **Branch** | One independent work unit inside a workflow — for example one input file, one dataset, or one variable. `--branches 0,2,5-7` selects which ones to run. | +| **Era** / **period** | A CMS data-taking period, e.g. `Run3_2022`, `Run3_2023BPix`. Passed as `--period`. Selects which datasets, corrections and NanoAOD version apply. See [Eras](../concepts/eras.md). | +| **Dataset** | A specific CMS sample, identified by its DAS name (a simulated signal/background, or a chunk of real data). | +| **Process** | A *physics* object built from one or more datasets (e.g. "TT", "DY", "signal"). What you actually plot and fit. Defined in `processes.yaml`. | +| **Sample** | Loosely used for "a dataset or a process". When precision matters, prefer *dataset* (CMS file set) vs *process* (physics grouping). | +| **anaTuple** | The analysis-level ntuple FLAF produces from NanoAOD: a slimmed, skimmed ROOT tree with the objects and flags the analysis needs. 
| +| **histTuple** | A further ntuple with the (heavier) analysis observables computed, ready to be turned into histograms. | +| **Histogram** | A binned distribution of one variable for one process, including its systematic variations. | +| **Version** | A label (`--version`) that namespaces a run's outputs, so different productions or tests don't collide. | +| **Customisation** | An ad-hoc `key=value` override passed via `--customisations` (e.g. `deepTauVersion=2p5`). | +| **Filesystem** (`fs_*`) | A named storage location (local or grid/EOS) where a given output type is read/written. Configured in `user_custom.yaml`. See [Storage](../concepts/storage.md). | +| **Bundle** | A tarball of the code/environment shipped to batch workers so a job can run without your AFS area. See [HTCondor](../workflow/htcondor.md). | +| **Physics model** | The set of processes (backgrounds, signals, data) used for an analysis. `TestModel` is the small set used in testing; production uses the full model. Defined in `phys_models.yaml`. | +| **Corrections** | Scale factors and systematic variations (pileup, b-tagging, triggers, …) applied during ntuple production, from the shared `Corrections` submodule. | +| **LAW / Luigi** | The workflow engine. [Luigi](https://luigi.readthedocs.io/) tracks task dependencies and outputs; [LAW](https://github.com/riga/law) adds the command-line interface, remote storage and batch submission FLAF uses. | +| **CMSSW** | The CMS software framework. Some stages run inside it; FLAF wraps this with the `cmsEnv` helper. | + +!!! tip "The mental model in one sentence" + You ask LAW for the **task** whose output you want, for a given **version** and **period**; + LAW runs the chain of tasks needed, splitting big tasks into **branches** that can go to + **HTCondor**, reading and writing each output to its configured **filesystem**. + +Ready to see the chain in action? 
Continue to the +[full-workflow walkthrough](../workflow/walkthrough.md), or read +[Tasks & LAW](../concepts/tasks-and-law.md) to understand the engine first. diff --git a/docs/getting-started/prerequisites.md b/docs/getting-started/prerequisites.md new file mode 100644 index 000000000..57ee85aad --- /dev/null +++ b/docs/getting-started/prerequisites.md @@ -0,0 +1,86 @@ +# Prerequisites + +Before installing anything, make sure you have access to the CERN computing infrastructure that +FLAF relies on. If you have run any CMS analysis on `lxplus` before, you almost certainly have +all of this already — skip to [Installation](installation.md). + +## 1. A CERN account and `lxplus` + +FLAF is developed and run on CERN's interactive login service, **`lxplus`**, currently on +**AlmaLinux 9** (`el9`). Connect with: + +```sh +ssh <username>@lxplus.cern.ch +``` + +!!! note "Other machines" + FLAF can run on any machine that provides CVMFS and an `el9` (or compatible) environment, + but `lxplus` is the supported and tested platform. The instructions throughout assume it. + +## 2. CVMFS + +FLAF gets its compilers, Python and ROOT from the CERN software distribution service +[**CVMFS**](https://cvmfs.readthedocs.io/). On `lxplus` it is already mounted. Check that the two +areas FLAF uses are visible: + +```sh +ls /cvmfs/cms.cern.ch # CMS software (CMSSW) +ls /cvmfs/sft.cern.ch # LCG software stacks (Python, ROOT, ...) +``` + +If those directories are empty or missing, CVMFS is not available and FLAF will not work. + +## 3. A grid certificate and CMS VO membership + +The pipeline reads CMS data from the grid (WLCG) and writes to grid storage, so you need a +**grid certificate** installed and to be a member of the **CMS Virtual Organisation (VO)**. + +- Request a grid certificate and join the CMS VO by following the + [CMS computing access guide](https://uscms.org/uscms_at_work/computing/getstarted/get_grid_cert.shtml) + (one-time setup). 
+- Your certificate (`usercert.pem` / `userkey.pem`) lives in `~/.globus/`. + +You will turn this certificate into a short-lived **VOMS proxy** every time you work — that step +is part of [Installation](installation.md). + +!!! warning "VOMS membership is not instant" + Joining the CMS VO requires approval and can take a day or two. Do this early. + +## 4. SSH keys for GitHub **and** CERN GitLab + +FLAF and the analyses live on **GitHub** (`github.com/cms-flaf/...`); some shared submodules +(the HH `inference` tooling) live on **CERN GitLab** (`gitlab.cern.ch`). Cloning with submodules +pulls from both, so you need an SSH key registered on each: + +- GitHub → [github.com/settings/keys](https://github.com/settings/keys) +- CERN GitLab → [gitlab.cern.ch/-/profile/keys](https://gitlab.cern.ch/-/profile/keys) + +Verify both work: + +```sh +ssh -T git@github.com # should greet you by username +ssh -T git@gitlab.cern.ch # should welcome you +``` + +!!! tip "Why two hosts?" + Most of FLAF is on GitHub. Only the combine-based HH statistical-inference submodule is on + CERN GitLab. If a `git clone --recursive` stalls or fails on the `inference` submodule, a + missing GitLab key is the usual cause. + +## 5. Somewhere to work and to store output + +- **Code** goes in your AFS work area (e.g. `/afs/cern.ch/work/<initial>/<username>/`), which has + more quota than your home directory. The first environment build needs a few GB under the + repository's `soft/` directory (CMSSW + a Python virtual environment). +- **Outputs** (ntuples, histograms) go to grid/EOS storage that you configure in + `user_custom.yaml` — see the [Configuration guide](../configuration/user-custom.md). + +## Checklist + +- [ ] Can `ssh` to `lxplus` (`el9`) +- [ ] `/cvmfs/cms.cern.ch` and `/cvmfs/sft.cern.ch` are populated +- [ ] Grid certificate in `~/.globus/`, member of the CMS VO +- [ ] SSH keys registered on **both** GitHub and CERN GitLab +- [ ] A few GB of free quota in your AFS work area + +All set? 
Continue to [Installation](installation.md). diff --git a/docs/glossary.md b/docs/glossary.md new file mode 100644 index 000000000..0366ce9af --- /dev/null +++ b/docs/glossary.md @@ -0,0 +1,117 @@ +# Glossary + +Framework and CMS-computing vocabulary, in plain terms. For the quick on-ramp version see +[Key terms](getting-started/key-terms.md). + +**anaTuple** +: The analysis-level ntuple FLAF produces from NanoAOD — a slimmed, skimmed ROOT tree with the + objects, weights and flags an analysis needs. Produced by `AnaTupleFileTask`, merged by + `AnaTupleMergeTask`. + +**AnaProd** +: The part of FLAF (`AnaProd/`) that produces anaTuples from NanoAOD, including the CMSSW-based + `anaTupleProducer.py`. + +**Branch** +: One independent work unit of a workflow task. What it represents depends on the task — an input + file, a dataset, or a variable. Select with `--branches`. + +**Bundle** +: A tarball of code/environment shipped to a batch worker so a job can run without the shared AFS + area. See [HTCondor](workflow/htcondor.md#bundles-shipping-the-code-to-workers). + +**Combine** +: The CMS statistical tool (`HiggsAnalysis/CombinedLimit`) used for limits and fits. FLAF builds a + standalone `v10.4.2`. + +**Corrections** +: The shared submodule providing object corrections and systematic variations (pileup, b-tag, + triggers, …) applied during ntuple production. + +**CMSSW** +: The CMS software framework. Some stages run inside it; FLAF wraps it with the `cmsEnv` helper. + +**CVMFS** +: The CERN read-only software-distribution filesystem (`/cvmfs/…`) from which FLAF gets compilers, + Python, ROOT (LCG stacks) and CMSSW. + +**DAS** +: The CMS Data Aggregation System — the catalogue of official datasets. `InputFileTask` queries it + to turn dataset names into file lists. + +**Dataset** +: One CMS sample (a simulated process or a chunk of data), identified by its DAS name. Declared in + `datasets.yaml`. See [Datasets](configuration/datasets.md). 
+ +**Era** / **period** +: A CMS data-taking period (`Run3_2022`, `Run3_2023BPix`, …), passed as `--period`. Selects + datasets, corrections and the NanoAOD version. See [Eras](concepts/eras.md). + +**Filesystem (`fs_*`)** +: A named storage location (local, EOS or a WLCG site) where a given output type is read/written. + Configured in `user_custom.yaml`. See [Storage](concepts/storage.md). + +**FLAF** +: The Flexible LAW-based Analysis Framework — the shared machinery (tasks, config, environment, CI) + included as a submodule in each analysis. + +**histTuple** +: An ntuple, derived from anaTuples, that carries the computed analysis observables, ready to be + histogrammed. Produced by `HistTupleProducerTask`. + +**HTCondor** +: CERN's batch system. FLAF submits workflow branches to it with `--workflow htcondor`. See + [HTCondor](workflow/htcondor.md). + +**LAW** +: [Luigi Analysis Workflow](https://github.com/riga/law) — the layer over Luigi that gives FLAF its + command-line interface, remote-storage handling and batch submission. + +**Luigi** +: The Python workflow engine that tracks task dependencies and outputs underneath LAW. + +**Meta-process** +: A process template that expands into a family of concrete processes (e.g. all resonant mass + points). Marked `is_meta_process: true`. See [Processes & models](configuration/processes-and-models.md). + +**NanoAOD** +: The compact CMS data format that is the input to the whole pipeline. + +**Payload producer** +: A configured component that computes an analysis observable during `HistTupleProducerTask`. + +**Physics model** +: The named set of processes (background/signal/data) an analysis uses. `TestModel` is the small + testing set; production uses the full model. Defined in `phys_models.yaml`. + +**PlotKit** +: FLAF's plotting-helpers submodule. + +**Process** +: A physics object built from one or more datasets (e.g. "TT", "DY", "signal") — what you plot and + fit. Defined in `processes.yaml`. 
+ +**Proxy (VOMS)** +: A short-lived credential derived from your grid certificate that authorises grid/EOS access. + Created with `voms-proxy-init`; FLAF expects it at `data/voms.proxy`. + +**RunKit** +: Workflow utilities vendored into FLAF as a regular directory (formerly a submodule). Imported as + `FLAF.RunKit.<module>`. + +**StatInference** +: The shared submodule for datacard creation and limit/fit tooling (used by the HH analyses). + +**Task** +: One stage of the pipeline with defined inputs, outputs and a run step. The unit LAW schedules. + See [Tasks & LAW](concepts/tasks-and-law.md) and the [Task reference](reference/tasks.md). + +**Version** +: The `--version` label that namespaces a run's outputs so productions and tests don't collide. + +**WLCG** +: The Worldwide LHC Computing Grid — the federation of sites (`T1_*`, `T2_*`, `T3_*`) where CMS + data and FLAF outputs are stored. + +**Workflow** +: A task that splits into many branches, runnable `local` or on `htcondor`. diff --git a/docs/hh_bbtautau.md b/docs/hh_bbtautau.md deleted file mode 100644 index c82346a5f..000000000 --- a/docs/hh_bbtautau.md +++ /dev/null @@ -1,78 +0,0 @@ -# HH->bb$\tau$$\tau$ analysis steps - -**Commands below assume that AnaTuples have already been produced. If not, please produce them following the instruction in the analysis section.** - -Remember that: - -- `ERA` variable is set. E.g. - ```sh - ERA=Run2_2016 - ``` - Alternatively you can add `ERA=Run2_2016; ...` in front of each command. - Run2 possible eras are: `Run2_2016`,`Run2_2016_HIPM`,`Run2_2017` and `Run2_2018` -
-- when expliciting `VERSION_NAME` variable, its name contains explicitly the deepTau version: `VERSION_NAME= vXX_deepTauYY_ZZZ`, where: - - XX is the anaTuple version (if not the first production it can be useful to have `v1,v2,..`), - - YY is the deepTau version (`2p1` or `2p5`) - - ZZZ are other eventual addition (e.g. if only tauTau channel `_onlyTauTau` or if `Zmumu` ntuples `_Zmumu`..) -
-- `--workflow` can be `htcondor` or `local`. It is recommended to develop and test locally and then switch to `htcondor` for production. In examples below `--workflow local` is used for illustration purposes.

-- when running on `htcondor` it is recommended to add `--transfer-logs` to the command to transfer logs to local.

-- `--customisations` argument is used to pass custom parameters to the task in form param1=value1,param2=value2,... - **IMPORTANT for HHbbTauTau analysis:** if running using deepTau 2p5 add `--customisations deepTauVersion=2p5`

-- if you want to run only on few files, you can specify list of branches to run using `--branches` argument. E.g. `--branches 2,7-10,17`.

-- to get status, use `--print-stauts N,K` where N is depth for task dependencies, K is depths for file dependencies. E.g. `--print-status 3,1`.

-- to remove task output use `--remove-output N,a`, where N is depth for task dependencies. E.g. `--remove-output 0,a`.

-- it is highly recommended to limitate the maximum number of parallel jobs running adding `--parallel-jobs M` where M is the number of the parallel jobs (e.g. M=100) - -## Create anaCacheTuple - -For each Anatuple, an anaCacheTuple (storing observables which are computationally heavier) will be created. - -```sh -law run AnaCacheTupleTask --period ${ERA} --version ${VERSION_NAME} -``` -**Note**: at the `AnaCacheTupleTask` stage, the addition of customisation for specifying the version is still needed. For the other tasks, it won't be needed anymore. - - -#### Merge data in anaCache tuples - -```sh -law run DataCacheMergeTask --period ${ERA} --version ${VERSION_NAME} -``` - - -### Histograms Production - -This has to be run after AnaTupleTask but **not necessairly** after AnaCacheTupleTask, if the variable to plot is not stored inside AnaCacheTuples. - -These task will produce histograms with observables that need to be specified inside the `Analysis/tasks.py` file, specifically inside the `vars_to_plot` list. - -The tasks to run are the following: - -1. `HistProducerFileTask`: for each AnaTuple an histogram of the corresponding variable will be created. - ```sh - law run HistProducerFileTask --period $ERA --version ${VERSION_NAME} - ``` -1. `HistProducerSampleTask`: all the histogram belonging to a specific sample will be merged in one histogram. - ```sh - law run HistProducerSampleTask --period $ERA --version ${VERSION_NAME} - ``` -1. `MergeTask`: all the histogram will be merged from samples to only one histograms under the folder `${HISTOGRAMS}/all_histograms/` to a specific sample will be merged in one histogram. At this stage, for each norm/shape uncertainty (+ central scenario) will be created one histogram. - ```sh - law run MergeTask --period $ERA --version ${VERSION_NAME} - ``` - Each histograms will be named as: `all_histograms_UNCERTAINTY.root` where uncertainty can be [Central, TauES_DM0, ecc....] - -1. 
`HaddMergedTask`: all the merged histograms (produced separately for each uncertainty) will be merged in only one file. - ```sh - law run HaddMergedTask --period $ERA --version ${VERSION_NAME} - ``` - Tip: It's very fast so it can be convenient to run this task in local. - The final histogram will be named as: `all_histograms_Hadded.root` - -## How to run HHbtag training skim ntuple production -```sh -python Studies/HHBTag/CreateTrainingSkim.py --inFile $CENTRAL_STORAGE/prod_v1/nanoAOD/2018/GluGluToBulkGravitonToHHTo2B2Tau_M-350.root --outFile output/skim.root --mass 350 --sample GluGluToBulkGraviton --year 2018 >& EventInfo.txt -python Common/SaveHisto.txt --inFile $CENTRAL_STORAGE/prod_v1/nanoAOD/2018/GluGluToBulkGravitonToHHTo2B2Tau_M-350.root --outFile output/skim.root -``` diff --git a/docs/index.md b/docs/index.md index 397c4f5cb..25a1c7f50 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,50 +1,87 @@ # FLAF -FLAF - Flexible LAW-based Analysis Framework. -Task workflow managed is done via [LAW](https://github.com/riga/law) (Luigi Analysis Framework). - -## How to install -1. Setup ssh keys: - - On GitHub [settings/keys](https://github.com/settings/keys) - - On CERN GitLab [profile/keys](https://gitlab.cern.ch/-/profile/keys) - -1. Clone the repository: - ```sh - git clone --recursive git@github.com:cms-flaf/Framework.git FLAF - ``` - -1. Create a user customisation file `config/user_custom.yaml`. It should contain all user-specific modifications that you don't want to be committed to the central repository. 
Below is example of minimal content of the file (replace `USER_NAME` and `ANA_FOLDER` with your values): - ```yaml - fs_default: - - 'T3_CH_CERNBOX:/store/user/USER_NAME/ANA_FOLDER/' - fs_anaCache: - - 'T3_CH_CERNBOX:/store/user/USER_NAME/ANA_FOLDER/' - fs_anaTuple: - - 'T3_CH_CERNBOX:/store/user/USER_NAME/ANA_FOLDER/' - fs_anaCacheTuple: - - 'T3_CH_CERNBOX:/store/user/USER_NAME/ANA_FOLDER/' - fs_histograms: - - 'T3_CH_CERNBOX:/store/user/USER_NAME/ANA_FOLDER/histograms/' - fs_json: - - 'T3_CH_CERNBOX:/store/user/USER_NAME/ANA_FOLDER/jsonFiles/' - analysis_config_area: config/HH_bbtautau - compute_unc_variations: true - store_noncentral: true - ``` - -## How to load environment -1. Following command activates the framework environment: - ```sh - source env.sh - ``` - -1. For the new installation or after you implement new law tasks, you need to update the law index: - ```sh - law index --verbose - ``` - -1. Initialize voms proxy: - ```sh - voms-proxy-init -voms cms -rfc -valid 192:00 - ``` +**FLAF** — the **F**lexible **LA**W-based Analysis **F**ramework — is the shared software +framework behind several CMS Higgs-sector analyses at CERN. It turns CMS +[NanoAOD](https://twiki.cern.ch/twiki/bin/view/CMSPublic/WorkBookNanoAOD) files into the +analysis ntuples, histograms, plots and statistical results that go into a physics paper. +FLAF organises this work as a chain of **tasks** managed by +[LAW](https://github.com/riga/law) (the Luigi Analysis Workflow). You describe *what* you want +(for example, "the final plots for the 2022 data"); LAW figures out *which* intermediate steps +are needed, runs only those, and can dispatch them to the CERN HTCondor batch system. + +!!! tip "New here? You are in the right place." + These docs assume **no prior experience** with LAW, Luigi or batch computing. 
If your + background is physics rather than software engineering, start with + [Key terms](getting-started/key-terms.md) and the [Getting started](getting-started/prerequisites.md) + track — every concept is introduced from scratch. + +## The big picture + +FLAF is shared by three analyses — **HH→bb̄ττ**, **HH→bb̄WW** and **H→μμ** — which all run the +same pipeline. From CMS NanoAOD to final results, the stages are: + +```mermaid +flowchart TD + NANO[CMS NanoAOD<br/>on DAS / WLCG] --> IFT[InputFileTask<br/>resolve the file list] + IFT --> ATF[AnaTupleFileTask<br/>produce analysis ntuples] + ATF --> ATM[AnaTupleMergeTask<br/>merge per dataset] + ATM --> HTP[HistTupleProducerTask<br/>compute analysis observables] + HTP --> HFN[HistFromNtupleProducerTask<br/>fill histograms] + HFN --> HM[HistMergerTask<br/>merge histograms] + HM --> HP[HistPlotTask<br/>make plots] + HP --> STAT[Statistical inference<br/>datacards, limits, scans] +``` + +Each box is a LAW **task**. You normally run only the *last* task you care about — LAW pulls in +everything upstream automatically. The whole pipeline is explained step by step in the +[full-workflow walkthrough](workflow/walkthrough.md). + +## How to read these docs + +The documentation is organised so you can enter at the level you need. + +<div class="grid cards" markdown> + +- :material-rocket-launch: **I'm new — get me running** + + Follow the [Getting started](getting-started/prerequisites.md) track: prerequisites → + installation → your first run. Then skim [Key terms](getting-started/key-terms.md). + +- :material-lightbulb-on: **I want to understand how it works** + + Read [Concepts](concepts/architecture.md): the architecture, what a LAW task is, the data + flow, the configuration system, eras, storage and the environment. + +- :material-cog: **I need to run the analysis** + + Use the [Full workflow](workflow/walkthrough.md) walkthrough and the + [Command arguments](workflow/arguments.md) cheat-sheet. Scale up with + [Running on HTCondor](workflow/htcondor.md). + +- :material-book-open-variant: **I'm looking something up** + + Jump to the [Task reference](reference/tasks.md), the [Configuration guide](configuration/user-custom.md), + the [Glossary](glossary.md) or [Troubleshooting](troubleshooting.md). + +</div>
+ +## What FLAF is — and is not + +- FLAF is a **framework**, not an analysis. It lives in the [`cms-flaf/FLAF`](https://github.com/cms-flaf/FLAF) + repository and is included as a **git submodule** inside each analysis repository. You never + clone FLAF on its own to run an analysis — you clone an analysis repository (which brings FLAF + with it). See [Architecture](concepts/architecture.md) and [Installation](getting-started/installation.md). +- FLAF provides the **common machinery**: the task definitions, the configuration system, the + environment setup, the storage abstraction and the CI. The **physics specifics** (which + signals, which observables, which categories) live in each analysis repository and are + documented there — see [Analyses](analyses.md). + +## Getting help + +- **Something went wrong?** Check [Troubleshooting](troubleshooting.md) first — it collects the + most common pitfalls and their fixes. +- **An unfamiliar word?** The [Glossary](glossary.md) translates framework vocabulary into + analyst terms. +- **Found a docs problem?** Use the :material-pencil: edit icon on any page to open a pull + request, or open an issue on [GitHub](https://github.com/cms-flaf/FLAF/issues). diff --git a/docs/reference/tasks.md b/docs/reference/tasks.md new file mode 100644 index 000000000..f8cac1501 --- /dev/null +++ b/docs/reference/tasks.md @@ -0,0 +1,80 @@ +# Task reference + +A concise reference for every FLAF task: what it does, what it branches over, and its task-specific +parameters. The **common** parameters (`--version`, `--period`, `--workflow`, `--branches`, +`--test`, …) apply to all of them and are documented in [Command arguments](../workflow/arguments.md). + +Production tasks live in `FLAF/AnaProd/tasks.py` (invoke as `FLAF.AnaProd.tasks.<TaskName>`); analysis +tasks live in `FLAF/Analysis/tasks.py` (invoke as `FLAF.Analysis.tasks.<TaskName>`). 
For the order in +which they run, see the [walkthrough](../workflow/walkthrough.md) and +[data flow](../concepts/data-flow.md). + +## Production tasks (`AnaProd`) + +### `InputFileTask` +Resolves the concrete list of NanoAOD files for the requested datasets and era (from DAS). Runs +locally (it is a `LocalWorkflow`, not submitted to HTCondor) and is cheap. Every downstream task +depends on it, so it runs first. + +### `AnaTupleFileTask` +Runs the analysis producer (`AnaProd/anaTupleProducer.py`, inside CMSSW) over input files to create +**anaTuples**. **Branches over input files** (one branch per NanoAOD file) — the workflow you most +often submit to HTCondor. + +### `AnaTupleFileListBuilderTask` / `AnaTupleFileListTask` +Helper workflows that assemble the lists of per-file anaTuples to be merged. Normally pulled in +automatically as dependencies of the merge step; you rarely call them directly. + +### `AnaTupleMergeTask` +Merges the per-file anaTuples into one anaTuple per dataset (data merged across runs). + +- **Parameter:** `--delete-inputs-after-merge` (bool, default `false`) — remove the per-file + inputs once the merge succeeds, to save space. + +## Analysis tasks (`Analysis`) + +### `HistTupleProducerTask` +Reads merged anaTuples and computes the analysis **observables** (the configured "payload +producers"), writing **histTuples**. + +### `HistFromNtupleProducerTask` +Fills **histograms** of the requested variables from the histTuples, including systematic +variations. **Branches over variables.** + +- **Parameters:** `--variables` (string; restrict which variables), `--n-var-batches` (int, + default `10`; how variables are grouped into branches). + +### `HistMergerTask` +Merges the per-piece histograms into per-process histograms ready for plotting and fitting. + +- **Parameter:** `--variables` (string; restrict which variables). 
+ +### `AnalysisCacheTask` +Pre-computes a per-event payload that later stages reuse — most importantly the **b-tag shape** +weights in HH→bb̄WW. Pulled in automatically when an analysis needs it. + +- **Parameter:** `--producer-to-run` (which cached payload producer to run). +- **Caveat:** on a cold cache this can be **time-consuming** (≈ 1 h per branch). Reuse it across + runs via a [per-task version override](../workflow/arguments.md#per-task-version-overrides). + +### `AnalysisCacheAggregationTask` +Aggregates the cached payloads produced by `AnalysisCacheTask` into the form the histogram stages +consume. + +- **Parameter:** `--producer-to-aggregate`. + +### `HistPlotTask` +Produces the final **plots**. **Branches over variables** (one branch per variable). + +- **Parameter:** `--variables` (string; restrict which variables). + +## Statistical-inference tasks + +The limit/fit tasks (e.g. `PlotResonantLimits`, `PlotPullsAndImpacts`) come from the +`StatInference` and `inference`/`dhi` submodules and run inside CMSSW/Combine. They are +analysis-specific — see each HH analysis's **Statistical inference** page (via +[Analyses](../analyses.md)) and the [walkthrough](../workflow/walkthrough.md#stage-5-statistical-inference). + +!!! tip "Discover parameters from the command line" + `law run <TaskName> --help` lists every parameter a task accepts, including the ones inherited from + the base classes. diff --git a/docs/stat_inference.md b/docs/stat_inference.md deleted file mode 100644 index 39223de62..000000000 --- a/docs/stat_inference.md +++ /dev/null @@ -1,36 +0,0 @@ -## How to run limits -1. As a temporary workaround, if you want to run multiplie commands, to avoid delays to load environment each time run: - ```sh - cmbEnv /bin/zsh # or /bin/bash - ``` - Alternatively add `cmbEnv` in front of each command. E.g. - ```sh - cmbEnv python3 -c 'print("hello")' - ``` - -1. Create datacards. 
- ```sh - python3 StatInference/dc_make/create_datacards.py --input PATH_TO_SHAPES --output PATH_TO_CARDS --config PATH_TO_CONFIG - ``` - Available configurations: - - For X->HH>bbtautau Run 2: [StatInference/config/x_hh_bbtautau_run2.yaml](https://github.com/cms-flaf/StatInference/blob/main/config/x_hh_bbtautau_run2.yaml) - - For X->HH->bbWW Run 3: [StatInference/config/x_hh_bbww_run3.yaml](https://github.com/cms-flaf/StatInference/blob/main/config/x_hh_bbww_run3.yaml) - -1. Run limits. - ```sh - law run PlotResonantLimits --version dev --datacards 'PATH_TO_CARDS/*.txt' --xsec fb --y-log - ``` - Hints: - - use `--workflow htcondor` to submit on HTCondor (by default it runs locally) - - add `--remove-output 4,a,y` to remove previous output files - - add `--print-status 0` to get status of the workflow (where `0` is a depth). Useful to get the output file name. - - for more details see [cms-hh inference documentation](https://cms-hh.web.cern.ch/tools/inference/) - -2. Plot Pulls and Impacts - ```sh - PlotPullsAndImpacts --version dev --datacards "PATH_TO_CARDS/specific_card.txt" --hh-model NO_STR --parameter-values r=1 --parameter-ranges r,-100,100 --method robust --PlotPullsAndImpacts-order-by-impact True --mc-stats True --PullsAndImpacts-custom-args="--expectSignal=1" - ``` - Hints: - - Don't use datacards as *.txt because pulls should be done for each mass point separately - - add `--remove-output 4,a,y` to remove previous output files - - add `--print-status 0` to get status of the workflow (where `0` is a depth). Useful to get the output file name. \ No newline at end of file diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md new file mode 100644 index 000000000..c89f83797 --- /dev/null +++ b/docs/troubleshooting.md @@ -0,0 +1,98 @@ +# Troubleshooting & FAQ + +The most common ways a FLAF run goes wrong, and how to fix them. 
If your symptom is not here, check +the job logs (run with `--transfer-logs` on HTCondor) and the task status +(`--print-status 3,1`). + +## `law: command not found` +You did not `source env.sh` in this shell. Every new terminal needs it once +([Installation](getting-started/installation.md)). + +## Import errors / empty submodule directories +You cloned without `--recursive`, so submodules (FLAF, PlotKit, physics tools) are empty. Fix: + +```sh +git submodule update --init --recursive +``` + +## A run unexpectedly drops into `InputFileTask` / DAS errors +For a from-scratch production, `InputFileTask` running first is normal. But if a run that should +reuse existing outputs keeps re-resolving inputs, or fails here, the cause is almost always a +**wrong `--period` or `--version`** (so the expected upstream outputs aren't found and LAW falls +back to regenerating them), or an **expired proxy**. Double-check the era/version, and: + +```sh +voms-proxy-info # is it still valid? +voms-proxy-init -voms cms -rfc -valid 192:00 +``` + +## "Permission denied" / "file not found" on storage +Usually an **expired VOMS proxy** — grid/EOS access needs a valid one. Re-run `voms-proxy-init`. If +it persists, confirm your `fs_*` paths in `user_custom.yaml` are correct and writable +([Storage](concepts/storage.md)). + +## "Task not found" after adding a task +LAW's index is stale. Re-run: + +```sh +law index --verbose +``` + +Needed after **adding/renaming/moving** a task class (not after editing an existing one's body). + +## EOS read-after-write lag +EOS is eventually consistent: a file you just wrote can be briefly invisible to an existence check +(seconds, occasionally longer). In normal pipeline use FLAF tolerates this. If **your own** script +checks for freshly written outputs and intermittently "can't find" them, don't trust a single +`exists()` — list the parent directory and retry a few times with a short delay. 
+ +## Cross-analysis environment contamination +The environment caches paths in variables (`FLAF_PATH`, `ANALYSIS_PATH`, `ANALYSIS_SOFT_PATH`, …). +Reusing a shell that already set up a *different* analysis can pick up the wrong `flaf_env` and +produce baffling failures. + +- **Interactive:** use a **fresh shell per analysis** and `source env.sh` there. +- **Scripted/background runs:** unset the FLAF/analysis variables before sourcing, but **keep** + `LD_LIBRARY_PATH`, `HOME` and `PATH`: + +```sh +unset FLAF_ENVIRONMENT_PATH ANALYSIS_SOFT_PATH LAW_HOME LAW_CONFIG_FILE \ + ANALYSIS_PATH ANALYSIS_DATA_PATH FLAF_PATH FLAF_CMSSW_BASE \ + FLAF_CMSSW_ARCH FLAF_CMSSW_VERSION FLAF_COMBINE_PATH \ + X509_USER_PROXY VIRTUAL_ENV PYTHONPATH +cd /path/to/<analysis_repo> +source env.sh +``` + +## ROOT/cling library or JIT errors in a background run +You launched the environment under `env -i`, which strips `LD_LIBRARY_PATH` (ROOT/cling needs it). +Preserve it (and `HOME`, `PATH`) when starting a clean shell. See +[The environment](concepts/environment.md#sharp-edges). + +## `source env.sh` sets the wrong path / fails to locate itself +You sourced it via `bash -c "source env.sh"`. That breaks `BASH_SOURCE` self-detection and sets the +wrong `ANALYSIS_PATH`. Source it directly in your interactive shell, or put your commands in a +**script file** and run that file. + +## HH→bb̄WW: the run sits in `AnalysisCacheTask` for a long time +Expected on a cold cache: `AnalysisCacheTask` computes the b-tag shape weights and can take roughly +an hour per branch. Reuse an existing cache across runs with a +[per-task version override](workflow/arguments.md#per-task-version-overrides) instead of +recomputing it every time. + +## A backgrounded `law run` won't stop when I kill it +Killing the parent leaves child `law`/job processes alive.
Kill by pattern, and remove batch jobs: + +```sh +pkill -f "version=<your_version>" +condor_rm <cluster_id> # if you submitted to HTCondor +``` + +## My edits to FLAF/Corrections are ignored +You edited the submodule copy but the run used a different one — or vice-versa. The run uses +`FLAF_PATH`/`CORRECTIONS_PATH`; set them to your edited copy **before** `source env.sh`. See +[Developing shared submodules](concepts/environment.md#developing-shared-submodules). + +## The first `source env.sh` takes forever +Expected: the first time it builds CMSSW and Combine (tens of minutes, a few GB under `soft/`). +Subsequent sources are quick. Don't interrupt the first build. diff --git a/docs/workflow/arguments.md b/docs/workflow/arguments.md new file mode 100644 index 000000000..2b54dd1cc --- /dev/null +++ b/docs/workflow/arguments.md @@ -0,0 +1,93 @@ +# Command arguments + +A reference for the options you pass to `law run`. The **common** ones are defined on FLAF's base +task classes (`FLAF/run_tools/law_customizations.py`), so they work on **every** FLAF task. LAW +also provides built-in options for status and cleanup. + +!!! note "Underscores become dashes on the command line" + A parameter named `transfer_logs` in the code is `--transfer-logs` on the CLI; + `anaTuple_version` is `--anaTuple-version`, and so on. + +## Common task options + +| Option | Default | Meaning | +|---|---|---| +| `--version` | *(required)* | Label that namespaces this run's outputs. Different versions never collide. | +| `--period` | *(required)* | The [era](../concepts/eras.md), e.g. `Run3_2022`. | +| `--workflow` | `local` | `local` (this machine) or `htcondor` (batch). See [HTCondor](htcondor.md). | +| `--branches` | *(all)* | Which branches to run, e.g. `0`, `0,2`, `5-7`. Restricts only the launched task, not its dependencies. | +| `--test` | `-1` | Process only N events per input file (`-1` = all). Great for smoke tests. | +| `--process` | `""` | Restrict to one process (e.g. `custom_CI_Signal`).
| +| `--dataset` | `""` | Restrict to one dataset. | +| `--model` | `""` | Override the physics model for this run. | +| `--customisations` | `""` | Ad-hoc `key=value,key=value` overrides (see below). | +| `--user-custom` | `""` | Path to an extra `user_custom`-style YAML, loaded last (see below). | + +## HTCondor options (on every workflow task) + +| Option | Default | Meaning | +|---|---|---| +| `--transfer-logs` | off | Bring job logs back to `data/`. Recommended. | +| `--parallel-jobs` | *(unbounded)* | Cap concurrent branches, e.g. `--parallel-jobs 100`. | +| `--max-runtime` | *(task default)* | Per-job wall-clock limit. | +| `--n-cpus` | `1` | CPUs requested per job. | +| `--priority` | `0` | Job priority. | +| `--bundle` | off | Ship a code/environment tarball to the worker. See [HTCondor → bundles](htcondor.md#bundles-shipping-the-code-to-workers). | +| `--htcondor-spool` | off | Spool job files to the schedd. | + +## Status & cleanup (LAW built-ins) + +| Option | Meaning | +|---|---| +| `--print-status N,K` | Show the dependency tree status to task depth `N`, file-collection depth `K`. Also prints output paths. `--print-status 3,1` is a good default. | +| `--print-deps N` | Print the dependency tree to depth `N` without checking outputs. | +| `--remove-output N,a,y` | Remove outputs to depth `N` (`a` = all branches, `y` = no prompt). Forces a recompute. **Deletes real files** — check the version first. | + +## `--customisations` + +Pass analysis-specific overrides as a comma-separated list: + +```sh +--customisations key1=value1,key2=value2 +``` + +!!! info "HH→bb̄ττ: select the DeepTau version" + To run with DeepTau 2.5, add `--customisations deepTauVersion=2p5`. (See the HH_bbtautau docs.) + +## `--user-custom`: per-run config overlay + +`--user-custom <path>` loads an extra YAML **on top of** your `config/user_custom.yaml` (loaded +last, so its values win). Use an absolute path or one relative to `$ANALYSIS_PATH`.
It is the +cleanest way to change settings for a single run without editing your committed file: + +```sh +law run FLAF.Analysis.tasks.HistPlotTask \ + --version test --period Run3_2022 --workflow local --branches 0 --test 1000 \ + --user-custom /afs/.../config/user_custom/test_local/HH_bbtautau.yaml +``` + +See [`user_custom.yaml`](../configuration/user-custom.md). + +## Per-task version overrides + +Every task carries its own `--version`, so you can make one run **read** an existing upstream +production while **writing** its downstream outputs under a new version. Override an upstream task's +version with `--<TaskName>-version`: + +```sh +law run FLAF.Analysis.tasks.HistTupleProducerTask \ + --version my_dev \ + --AnaTupleMergeTask-version v2605 \ + --AnaTupleFileListTask-version v2605 \ + --period Run3_2022EE --workflow local +``` + +Here the anaTuples are reused from the central `v2605` production, while the histTuples are written +under `my_dev`. This is the key to fast, parallel development: many people can share one upstream +production without recomputing it. The base task also exposes related shortcuts +(`--anaTuple-version`, `--anaCache-version`, `--ana-version`) used by some stages. + +!!! tip "`--<TaskName>-<parameter>` works generally" + LAW lets you set *any* parameter of *any* task in the dependency tree by prefixing it with the + task's class name. Version overrides are the most common case, but the same mechanism applies + to other parameters. diff --git a/docs/workflow/htcondor.md b/docs/workflow/htcondor.md new file mode 100644 index 000000000..50be05760 --- /dev/null +++ b/docs/workflow/htcondor.md @@ -0,0 +1,75 @@ +# Running on HTCondor + +Producing ntuples and histograms for a full era means processing thousands of files — far too much +for one machine. FLAF tasks are **workflows** ([Tasks & LAW](../concepts/tasks-and-law.md)), so +their branches can be submitted to CERN's **HTCondor** batch system.
The recommended pattern is to +**develop and test with `--workflow local`, then switch to `--workflow htcondor` for production** — +the command is otherwise the same. + +## Submit a task to the batch system + +```sh +law run FLAF.AnaProd.tasks.AnaTupleFileTask \ + --period Run3_2022 --version prod \ + --workflow htcondor \ + --transfer-logs \ + --parallel-jobs 100 +``` + +| Option | Why you want it | +|---|---| +| `--workflow htcondor` | Submit branches as batch jobs instead of running locally. | +| `--transfer-logs` | Bring each job's stdout/stderr back to your `data/` area. **Highly recommended** — without it, debugging a failed job is painful. | +| `--parallel-jobs 100` | Cap how many jobs are in flight at once. Be a good citizen on the shared pool; very large uncapped submissions are discouraged. | +| `--branches 0-99` | Submit only a subset (e.g. to retry a range). | + +Other HTCondor parameters available on every workflow task: `--max-runtime`, `--n-cpus`, +`--priority`, `--htcondor-spool`. See [Command arguments](arguments.md). + +## Monitor and resume + +LAW tracks which branches have finished (by checking their outputs), so a re-run only resubmits the +missing ones — batch jobs fail and time out, and resuming is normal. Check progress with: + +```sh +law run FLAF.AnaProd.tasks.AnaTupleFileTask \ + --period Run3_2022 --version prod --print-status 1,1 +``` + +Standard `condor_q` / `condor_status` work for the underlying jobs. + +## Bundles: shipping the code to workers + +A batch worker needs your code and environment. FLAF supports two modes: + +- **Non-bundle jobs** rely on the shared AFS area being mounted on the worker: the job receives + `FLAF_PATH`/`CORRECTIONS_PATH` and runs the code straight from AFS (including any edits you made + via the [dev overlay](../concepts/environment.md#developing-shared-submodules)). +- **Bundle jobs** ship a tarball of the code/environment to the worker (the `--bundle` flag and the + `BundleTask` machinery). 
The worker runs from the tarball and never reaches back to AFS, so it is + deliberately *not* given `FLAF_PATH`/`CORRECTIONS_PATH`. Bundles also set `FLAF_NO_INSTALL=1` so + the worker never tries to build the environment. + +For most work the defaults are correct; you only think about bundles when a stage explicitly needs +one (e.g. it declares a CMSSW bundle flavour) or when AFS is not available on the target pool. + +!!! tip "Your edits to FLAF *do* reach the workers" + Thanks to the dev overlay, non-bundle jobs run your edited `FLAF`/`Corrections`, and bundle + jobs include them in the tarball — so testing framework changes on HTCondor works without + committing first. See [Contributing](../contributing.md). + +## Caveats + +!!! warning "Keep your proxy valid for the whole run" + Jobs that outlive your VOMS proxy lose grid access mid-flight. Create a long-lived proxy + (`-valid 192:00`) before a big submission, and refresh it for long campaigns. + +!!! warning "Killing a background `law` leaves its jobs/children" + Pressing `Ctrl-C` or `kill`-ing a backgrounded `law` process does not necessarily stop the + branches it spawned. To stop everything for a run, match the processes by pattern, e.g. + `pkill -f "version=prod"`, and `condor_rm` the submitted jobs if needed. + +!!! note "Test small, then scale" + Validate a task with `--workflow local --branches 0 --test 1000` before submitting the full + workflow to HTCondor. A bug found on one local branch is far cheaper than one found across a + thousand batch jobs. diff --git a/docs/workflow/walkthrough.md b/docs/workflow/walkthrough.md new file mode 100644 index 000000000..797558ae4 --- /dev/null +++ b/docs/workflow/walkthrough.md @@ -0,0 +1,159 @@ +# Full workflow walkthrough + +This is the end-to-end tour of the pipeline: every stage, in order, with the command that runs it. 
+Read it once to understand the chain; in day-to-day work you usually run only the *last* stage you +need and let LAW produce the rest (see [the shortcut](#the-shortcut-just-ask-for-the-end)). + +The commands use `FLAF.AnaProd.tasks.*` for production stages and `FLAF.Analysis.tasks.*` for +analysis stages — the fully-qualified task paths the framework registers. + +## Setup recap + +```sh +cd HH_bbtautau # your analysis repository +source env.sh # once per shell +voms-proxy-info # confirm a valid proxy + +# Pick a data-taking era and a label for this production: +ERA=Run3_2022 +VER=dev +``` + +Throughout, `--period $ERA` selects the [era](../concepts/eras.md) and `--version $VER` namespaces +the [outputs](../concepts/data-flow.md#versions-keep-productions-apart). Add `--workflow local` +to run on this machine; switch to `--workflow htcondor` to scale up +([HTCondor guide](htcondor.md)). + +--- + +## Stage 0 — Resolve the input files + +`InputFileTask` turns "the datasets for this era" into a concrete list of NanoAOD files (from DAS). +Everything else depends on it, so it runs first — automatically when you launch a later stage, or +explicitly: + +```sh +law run FLAF.AnaProd.tasks.InputFileTask --period $ERA --version $VER --workflow local +``` + +It is fast and cheap. If a from-scratch run unexpectedly *stays* in `InputFileTask` or fails here, +suspect a wrong `--period`/`--version` or an expired proxy. + +## Stage 1 — Produce and merge analysis ntuples (anaTuples) + +`AnaTupleFileTask` runs the analysis producer (`AnaProd/anaTupleProducer.py`, inside CMSSW) over +each NanoAOD file — **one branch per file** — applying the object selections and +[corrections](../concepts/architecture.md#common-vs-analysis-specific) and writing a slimmed +**anaTuple**. `AnaTupleMergeTask` then merges the per-file pieces into one anaTuple per dataset. 
+ +```sh +# Produce per-file anaTuples (heavy; normally on HTCondor): +law run FLAF.AnaProd.tasks.AnaTupleFileTask --period $ERA --version $VER --workflow local + +# Merge them per dataset: +law run FLAF.AnaProd.tasks.AnaTupleMergeTask --period $ERA --version $VER --workflow local +``` + +!!! tip "Test on a few files first" + `--branches 0,1,2` runs only the first three input files, and `--test 1000` processes only + 1000 events per file. Combine both to smoke-test ntuple production quickly. + +## Stage 2 — Compute analysis observables (histTuples) + +`HistTupleProducerTask` reads the merged anaTuples and computes the heavier analysis +**observables** (the "payload producers" configured in `global.yaml`), writing **histTuples**: + +```sh +law run FLAF.Analysis.tasks.HistTupleProducerTask --period $ERA --version $VER --workflow local +``` + +!!! note "HH→bb̄WW: a caching step runs here first" + In HH→bb̄WW, `AnalysisCacheTask` (and `AnalysisCacheAggregationTask`) pre-compute and aggregate + per-event payloads — notably the **b-tag shape** weights — before histogramming. They are + pulled in automatically and can be **time-consuming** (budget roughly an hour per branch on a + cold cache). See the [HH_bbWW docs](../analyses.md) and the [Task reference](../reference/tasks.md). + +## Stage 3 — Fill and merge histograms + +`HistFromNtupleProducerTask` fills **histograms** of the requested variables from the histTuples — +**one branch per variable** — including systematic variations. `HistMergerTask` merges the pieces +into per-process histograms ready for plotting and fitting. 
+ +```sh +# Fill histograms (restrict variables with --variables, batch with --n-var-batches): +law run FLAF.Analysis.tasks.HistFromNtupleProducerTask --period $ERA --version $VER --workflow local + +# Merge them: +law run FLAF.Analysis.tasks.HistMergerTask --period $ERA --version $VER --workflow local +``` + +Which variables are produced is controlled by the analysis config and can be narrowed with the +`--variables` parameter or the `variables:` list in `user_custom.yaml`. + +## Stage 4 — Make the plots + +`HistPlotTask` produces the final plots — **one branch per variable**: + +```sh +law run FLAF.Analysis.tasks.HistPlotTask --period $ERA --version $VER --workflow local +# one variable only: +law run FLAF.Analysis.tasks.HistPlotTask --period $ERA --version $VER --workflow local --branches 0 +``` + +This is the task you most often launch directly: asking for the plots makes LAW produce every +upstream product that is missing. + +## Stage 5 — Statistical inference + +The two HH analyses turn the merged histograms into datacards and then run limits and diagnostics +with [Combine](https://cms-analysis.github.io/HiggsAnalysis-CombinedLimit/), via the +`StatInference` and `inference` submodules. H→μμ does not include this stage. + +Because these commands run inside CMSSW/Combine, prefix them with `cmsEnv` (or open a `cmsEnv` +subshell once): + +```sh +# 1) Create datacards from the produced shapes: +cmsEnv python3 StatInference/dc_make/create_datacards.py \ + --input <path_to_shapes> \ + --output <path_to_cards> \ + --config <path_to_config> # e.g. StatInference/config/x_hh_bbww_run3.yaml + +# 2) Run resonant limits: +law run PlotResonantLimits --version $VER --datacards '<path_to_cards>/*.txt' --xsec fb --y-log + +# 3) Pulls & impacts (per mass point — point at a single card): +PlotPullsAndImpacts --version $VER --datacards "<path_to_cards>/<specific_card>.txt" ...
+``` + +The exact configs and options are analysis-specific — see each analysis's **Statistical +inference** page (linked from [Analyses](../analyses.md)) and the +[cms-hh inference docs](https://cms-hh.web.cern.ch/tools/inference/). + +--- + +## The shortcut: just ask for the end + +You rarely run the stages one by one. Because every task knows its dependencies, launching a late +stage runs all missing upstream stages automatically: + +```sh +law run FLAF.Analysis.tasks.HistPlotTask --period $ERA --version $VER --workflow local +``` + +Run the individual stages explicitly only when you want to **stop at** an intermediate product +(e.g. produce anaTuples for someone else to use), or to inspect/debug one stage. + +## See progress and redo selectively + +```sh +# Status of the whole tree (task depth 3, file depth 1) — also prints output paths: +law run FLAF.Analysis.tasks.HistPlotTask --period $ERA --version $VER --print-status 3,1 + +# Force one stage to be recomputed: +law run FLAF.Analysis.tasks.HistMergerTask --period $ERA --version $VER --remove-output 0,a,y +``` + +See [Command arguments](arguments.md) for the full option list, and [Running on HTCondor](htcondor.md) +to take any of these commands to the batch system by swapping `--workflow local` for +`--workflow htcondor`. diff --git a/mkdocs.yml b/mkdocs.yml index 0b8ce6fb1..f3cd8003c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,13 +1,19 @@ site_name: FLAF -repo_name: GitHub -repo_url: https://github.com/cms-flaf/Framework +site_description: >- + FLAF — the Flexible LAW-based Analysis Framework. Documentation for the shared + CMS analysis framework used by HH->bbtautau, HH->bbWW and H->mumu. 
+repo_name: cms-flaf/FLAF +repo_url: https://github.com/cms-flaf/FLAF +edit_uri: edit/main/docs/ + theme: name: material font: false features: - content.action.edit - content.action.view - - navigation.expand + - content.code.copy + - content.code.annotate - navigation.footer - navigation.indexes - navigation.sections @@ -26,6 +32,8 @@ markdown_extensions: - def_list - footnotes - meta + - md_in_html + - tables - toc: permalink: true # Python Markdown Extensions @@ -38,7 +46,8 @@ markdown_extensions: - pymdownx.emoji: emoji_index: !!python/name:material.extensions.emoji.twemoji emoji_generator: !!python/name:material.extensions.emoji.to_svg - - pymdownx.highlight + - pymdownx.highlight: + anchor_linenums: true - pymdownx.inlinehilite - pymdownx.keys - pymdownx.mark @@ -53,26 +62,42 @@ markdown_extensions: - pymdownx.tasklist: custom_checkbox: true - pymdownx.tilde - - pymdownx.blocks.admonition - - pymdownx.blocks.details - - pymdownx.blocks.tab plugins: - search -extra_css: - - stylesheets/fonts.css - extra_javascript: - https://unpkg.com/mermaid@9.3/dist/mermaid.min.js - - https://polyfill.io/v3/polyfill.min.js?features=es6 - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js - nav: - Home: index.md - - Analysis: analysis.md - - HH->bbtautau: hh_bbtautau.md - - Statistical inference: stat_inference.md - -#theme: readthedocs \ No newline at end of file + - Getting started: + - Prerequisites: getting-started/prerequisites.md + - Installation: getting-started/installation.md + - Your first run: getting-started/first-run.md + - Key terms: getting-started/key-terms.md + - Concepts: + - Architecture: concepts/architecture.md + - Tasks & LAW: concepts/tasks-and-law.md + - Data flow: concepts/data-flow.md + - Configuration system: concepts/configuration.md + - Eras & periods: concepts/eras.md + - Storage & filesystems: concepts/storage.md + - The environment: concepts/environment.md + - Full workflow: + - Walkthrough: workflow/walkthrough.md + - Running on 
HTCondor: workflow/htcondor.md + - Command arguments: workflow/arguments.md + - Configuration guide: + - user_custom.yaml: configuration/user-custom.md + - Datasets: configuration/datasets.md + - Processes & models: configuration/processes-and-models.md + - Task reference: reference/tasks.md + - CI / CD: + - GitHub Actions: ci/github-actions.md + - Integration pipeline: ci/integration-pipeline.md + - Troubleshooting: troubleshooting.md + - Glossary: glossary.md + - Contributing: contributing.md + - Analyses: analyses.md