From 2ae28ba7f431cc9545afb3288bbc5f1c45405a65 Mon Sep 17 00:00:00 2001 From: Jean Mertz Date: Mon, 25 May 2026 14:10:51 +0200 Subject: [PATCH] chore(comfort): Add semantic line-break Rust/Markdown formatter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `comfort` is a new contrib crate that reformats Rust doc comments (`///` and `//!`) and Markdown files using semantic line breaks — one sentence per line — with an optional `--max-width` safety net for long sentences. The pipeline runs in three layers: `extract` finds doc-comment blocks via `ra-ap-rustc_lexer`, `reflow_markdown` parses each block's body with comrak and walks the AST to locate reflowable paragraphs, and `reflow_paragraph` segments those paragraphs into sentences using a UAX #29 splitter with abbreviation-aware merging. Markdown structure (headings, code blocks, tables, block quotes, lists, footnotes, hard line breaks) is preserved verbatim; only prose paragraph contents are reflowed. Container blocks (block quotes, list items, task items, alerts, footnote definitions) each contribute the correct per-line continuation prefix so reflow output round-trips cleanly. Two optional passes compose on top of the always-on sembr engine: - `--format-markdown`: canonicalizes markdown structure via comrak's `format_commonmark` (normalizes list markers to `-`, prefers fenced code blocks, aligns table columns for visual display width including CJK wide characters) before reflowing. - `--reference-links`: converts inline `[text](url)` links to reference-style `[text]` with definitions consolidated and sorted alphabetically at the bottom of each body. Both transformations are idempotent individually and when composed. The tool ships two binaries from the same entry point: `comfort` (direct invocation, defaults to stdin/stdout) and `cargo-comfort` (cargo subcommand, defaults to `--workspace`). 
Common flags: `--check`, `--list-changed`, `--workspace`, `-p`/`--package`, `--exclude`, `--language`, `--stdin-filename`, `--max-width`. A new `fmt-comments-ci` justfile target runs `comfort --check --workspace` in CI, added to the `ci` recipe alongside the existing `fmt-ci` step. Signed-off-by: Jean Mertz --- .config/supply-chain/audits.toml | 26 + .config/supply-chain/config.toml | 4 - .config/supply-chain/imports.lock | 49 +- Cargo.lock | 68 +- Cargo.toml | 6 +- crates/contrib/comfort/Cargo.toml | 36 + .../contrib/comfort/src/bin/cargo-comfort.rs | 3 + crates/contrib/comfort/src/bin/comfort.rs | 3 + crates/contrib/comfort/src/cli.rs | 159 ++ crates/contrib/comfort/src/cli_tests.rs | 69 + crates/contrib/comfort/src/extract.rs | 228 +++ crates/contrib/comfort/src/extract_tests.rs | 199 ++ crates/contrib/comfort/src/format.rs | 1706 +++++++++++++++++ crates/contrib/comfort/src/format_tests.rs | 595 ++++++ crates/contrib/comfort/src/lib.rs | 126 ++ crates/contrib/comfort/src/lib_tests.rs | 1545 +++++++++++++++ crates/contrib/comfort/src/run.rs | 188 ++ crates/contrib/comfort/src/sentence.rs | 302 +++ crates/contrib/comfort/src/sentence_tests.rs | 267 +++ crates/contrib/comfort/src/walk.rs | 129 ++ crates/contrib/comfort/src/walk_tests.rs | 102 + deny.toml | 3 +- justfile | 10 +- 23 files changed, 5796 insertions(+), 27 deletions(-) create mode 100644 crates/contrib/comfort/Cargo.toml create mode 100644 crates/contrib/comfort/src/bin/cargo-comfort.rs create mode 100644 crates/contrib/comfort/src/bin/comfort.rs create mode 100644 crates/contrib/comfort/src/cli.rs create mode 100644 crates/contrib/comfort/src/cli_tests.rs create mode 100644 crates/contrib/comfort/src/extract.rs create mode 100644 crates/contrib/comfort/src/extract_tests.rs create mode 100644 crates/contrib/comfort/src/format.rs create mode 100644 crates/contrib/comfort/src/format_tests.rs create mode 100644 crates/contrib/comfort/src/lib.rs create mode 100644 crates/contrib/comfort/src/lib_tests.rs 
create mode 100644 crates/contrib/comfort/src/run.rs create mode 100644 crates/contrib/comfort/src/sentence.rs create mode 100644 crates/contrib/comfort/src/sentence_tests.rs create mode 100644 crates/contrib/comfort/src/walk.rs create mode 100644 crates/contrib/comfort/src/walk_tests.rs diff --git a/.config/supply-chain/audits.toml b/.config/supply-chain/audits.toml index 5dd03139..42ce71aa 100644 --- a/.config/supply-chain/audits.toml +++ b/.config/supply-chain/audits.toml @@ -21,6 +21,11 @@ who = "Jean Mertz " criteria = "safe-to-deploy" delta = "0.49.0 -> 0.50.0" +[[audits.comrak]] +who = "Jean Mertz " +criteria = "safe-to-deploy" +delta = "0.50.0 -> 0.52.0" + [[audits.datetime_literal]] who = "Jean Mertz " criteria = "safe-to-deploy" @@ -41,6 +46,11 @@ who = "Jean Mertz " criteria = "safe-to-deploy" delta = "0.16.2 -> 0.17.0" +[[audits.finl_unicode]] +who = "Jean Mertz " +criteria = "safe-to-deploy" +version = "1.4.0" + [[audits.futf]] who = "Jean Mertz " criteria = "safe-to-deploy" @@ -56,6 +66,11 @@ who = "Jean Mertz " criteria = "safe-to-deploy" version = "0.36.1" +[[audits.imara-diff]] +who = "Jean Mertz " +criteria = "safe-to-deploy" +version = "0.2.0" + [[audits.infer]] who = "Jean Mertz " criteria = "safe-to-deploy" @@ -111,6 +126,11 @@ who = "Jean Mertz " criteria = "safe-to-deploy" delta = "0.38.4 -> 0.39.2" +[[audits.ra-ap-rustc_lexer]] +who = "Jean Mertz " +criteria = "safe-to-deploy" +version = "0.167.0" + [[audits.rand_xorshift]] who = "Jean Mertz " criteria = "safe-to-deploy" @@ -1005,6 +1025,12 @@ user-id = 3618 # David Tolnay (dtolnay) start = "2021-10-02" end = "2027-02-13" +[[trusted.unicode-properties]] +criteria = "safe-to-deploy" +user-id = 1139 # Manish Goregaokar (Manishearth) +start = "2023-07-27" +end = "2027-05-21" + [[trusted.unsafe-libyaml]] criteria = "safe-to-deploy" user-id = 3618 # David Tolnay (dtolnay) diff --git a/.config/supply-chain/config.toml b/.config/supply-chain/config.toml index 4330acb4..84402393 100644 --- 
a/.config/supply-chain/config.toml +++ b/.config/supply-chain/config.toml @@ -683,10 +683,6 @@ criteria = "safe-to-deploy" version = "1.18.0" criteria = "safe-to-deploy" -[[exemptions.unicode_categories]] -version = "0.1.1" -criteria = "safe-to-deploy" - [[exemptions.untrusted]] version = "0.9.0" criteria = "safe-to-deploy" diff --git a/.config/supply-chain/imports.lock b/.config/supply-chain/imports.lock index c55c4688..f5f18ebd 100644 --- a/.config/supply-chain/imports.lock +++ b/.config/supply-chain/imports.lock @@ -399,8 +399,8 @@ user-login = "Amanieu" user-name = "Amanieu d'Antras" [[publisher.memchr]] -version = "2.7.5" -when = "2025-06-11" +version = "2.8.0" +when = "2026-02-06" user-id = 189 user-login = "BurntSushi" user-name = "Andrew Gallant" @@ -791,8 +791,8 @@ user-login = "BurntSushi" user-name = "Andrew Gallant" [[publisher.unicode-ident]] -version = "1.0.19" -when = "2025-09-10" +version = "1.0.24" +when = "2026-02-16" user-id = 3618 user-login = "dtolnay" user-name = "David Tolnay" @@ -804,6 +804,13 @@ user-id = 1139 user-login = "Manishearth" user-name = "Manish Goregaokar" +[[publisher.unicode-properties]] +version = "0.1.4" +when = "2025-10-30" +user-id = 1139 +user-login = "Manishearth" +user-name = "Manish Goregaokar" + [[publisher.unicode-segmentation]] version = "1.12.0" when = "2024-09-13" @@ -2752,7 +2759,7 @@ who = "Manish Goregaokar " criteria = "safe-to-deploy" user-id = 1139 # Manish Goregaokar (Manishearth) start = "2019-11-06" -end = "2026-02-01" +end = "2027-04-23" notes = "All code written or reviewed by Manish" aggregated-from = "https://hg.mozilla.org/mozilla-central/raw-file/tip/supply-chain/audits.toml" @@ -2761,7 +2768,7 @@ who = "Manish Goregaokar " criteria = "safe-to-deploy" user-id = 1139 # Manish Goregaokar (Manishearth) start = "2019-05-15" -end = "2026-02-01" +end = "2027-04-23" notes = "All code written or reviewed by Manish" aggregated-from = 
"https://hg.mozilla.org/mozilla-central/raw-file/tip/supply-chain/audits.toml" @@ -3560,6 +3567,36 @@ criteria = "safe-to-deploy" delta = "0.13.1 -> 0.13.2" aggregated-from = "https://raw.githubusercontent.com/mozilla/cargo-vet/main/supply-chain/audits.toml" +[[audits.mozilla.audits.textwrap]] +who = "Jan-Erik Rediger " +criteria = "safe-to-deploy" +version = "0.15.0" +aggregated-from = "https://raw.githubusercontent.com/mozilla/glean/main/supply-chain/audits.toml" + +[[audits.mozilla.audits.textwrap]] +who = "Mike Hommey " +criteria = "safe-to-deploy" +delta = "0.15.0 -> 0.15.2" +aggregated-from = "https://hg.mozilla.org/mozilla-central/raw-file/tip/supply-chain/audits.toml" + +[[audits.mozilla.audits.textwrap]] +who = "Mike Hommey " +criteria = "safe-to-deploy" +delta = "0.15.2 -> 0.16.0" +aggregated-from = "https://hg.mozilla.org/mozilla-central/raw-file/tip/supply-chain/audits.toml" + +[[audits.mozilla.audits.textwrap]] +who = "Jan-Erik Rediger " +criteria = "safe-to-deploy" +delta = "0.16.0 -> 0.16.1" +aggregated-from = "https://hg.mozilla.org/mozilla-central/raw-file/tip/supply-chain/audits.toml" + +[[audits.mozilla.audits.textwrap]] +who = "Nika Layzell " +criteria = "safe-to-deploy" +delta = "0.16.1 -> 0.16.2" +aggregated-from = "https://raw.githubusercontent.com/mozilla/cargo-vet/main/supply-chain/audits.toml" + [[audits.mozilla.audits.thiserror]] who = "Jan-Erik Rediger " criteria = "safe-to-deploy" diff --git a/Cargo.lock b/Cargo.lock index 8d50c667..f18844d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -565,6 +565,25 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "comfort" +version = "0.1.0" +dependencies = [ + "cargo_metadata", + "clap", + "comrak", + "ignore", + "indoc", + "pretty_assertions", + "ra-ap-rustc_lexer", + "regex", + "similar", + "textwrap", + "thiserror 2.0.18", + "unicode-segmentation", + 
"unicode-width", +] + [[package]] name = "comfy-table" version = "7.2.1" @@ -580,19 +599,19 @@ dependencies = [ [[package]] name = "comrak" -version = "0.50.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "321d20bf105b6871a49da44c5fbb93e90a7cd6178ea5a9fe6cbc1e6d4504bc5e" +checksum = "aac0b255932a9cd52fbfd664b67957f9f2e095ae4711cb0e41b4e291edef94c2" dependencies = [ "caseless", "entities", + "finl_unicode", "jetscii", "phf", "phf_codegen", "rustc-hash 2.1.1", "smallvec", "typed-arena", - "unicode_categories", ] [[package]] @@ -1196,6 +1215,12 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "finl_unicode" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9844ddc3a6e533d62bba727eb6c28b5d360921d5175e9ff0f1e621a5c590a4d5" + [[package]] name = "flate2" version = "1.1.2" @@ -2701,9 +2726,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.5" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "memmap2" @@ -3232,6 +3257,17 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "ra-ap-rustc_lexer" +version = "0.167.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ff5a3b958382dbdfb5bd325ad10643da18f83b3894485908b5d20b37abc0a" +dependencies = [ + "memchr", + "unicode-ident", + "unicode-properties", +] + [[package]] name = "rand" version = "0.9.2" @@ -4322,6 +4358,12 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"f18aa187839b2bdb1ad2fa35ead8c4c2976b64e4363c386d45ac0f7ee85c9233" +[[package]] +name = "textwrap" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13547615a44dc9c452a8a534638acdf07120d4b6847c8178705da06306a3057" + [[package]] name = "thiserror" version = "1.0.69" @@ -4768,9 +4810,9 @@ checksum = "75b844d17643ee918803943289730bec8aac480150456169e647ed0b576ba539" [[package]] name = "unicode-ident" -version = "1.0.19" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-normalization" @@ -4781,6 +4823,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-properties" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" + [[package]] name = "unicode-segmentation" version = "1.12.0" @@ -4793,12 +4841,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" -[[package]] -name = "unicode_categories" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" - [[package]] name = "unsafe-libyaml" version = "0.2.11" diff --git a/Cargo.toml b/Cargo.toml index 7636edf2..2be32280 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,7 +54,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd clap = { version = "4", default-features = false } clean-path = { version = "0.2", default-features = false } comfy-table = { version = "7", default-features = false } -comrak = { version = "0.50", default-features = false } +comrak = { version = "0.52", default-features 
= false } convert_case = { version = "0.11", default-features = false } crossbeam-channel = { version = "0.5", default-features = false } crossterm = { version = "0.29", default-features = false } @@ -98,7 +98,9 @@ proc-macro2 = { version = "1", default-features = false } proptest = { version = "1", default-features = false } quick-xml = { version = "0.39", default-features = false } quote = { version = "1", default-features = false } +ra-ap-rustc_lexer = { version = "0.167", default-features = false } rayon = { version = "1", default-features = false } +regex = { version = "1", default-features = false } relative-path = { version = "2", default-features = false } reqwest = { version = "0.12", default-features = false } reqwest-eventsource = { version = "0.6", default-features = false } @@ -125,6 +127,7 @@ strip-ansi-escapes = { version = "0.2", default-features = false } syn = { version = "2", default-features = false } syntect = { version = "5.3", default-features = false } test-log = { version = "0.2", default-features = false, features = ["trace"] } +textwrap = { version = "0.16", default-features = false } thiserror = { version = "2", default-features = false } timeago = { version = "0.6", default-features = false } tokio = { version = "1", default-features = false, features = ["full"] } @@ -136,6 +139,7 @@ tracing = { version = "0.1", default-features = false } tracing-subscriber = { version = "0.3", default-features = false } two-face = { version = "0.5", default-features = false } typetag = { version = "0.2", default-features = false } +unicode-segmentation = { version = "1", default-features = false } unicode-width = { version = "0.2", default-features = false } url = { version = "2", default-features = false } which = { version = "8", default-features = false } diff --git a/crates/contrib/comfort/Cargo.toml b/crates/contrib/comfort/Cargo.toml new file mode 100644 index 00000000..7f1d4302 --- /dev/null +++ b/crates/contrib/comfort/Cargo.toml @@ -0,0 +1,36 
@@ +[package] +description = "Semantic line-break formatter for Rust doc comments." +name = "comfort" + +authors.workspace = true +documentation.workspace = true +edition.workspace = true +homepage.workspace = true +license-file.workspace = true +publish.workspace = true +readme.workspace = true +repository.workspace = true +version.workspace = true + +[dependencies] +cargo_metadata = { workspace = true } +clap = { workspace = true, features = ["std", "derive", "help", "usage", "error-context"] } +comrak = { workspace = true } +ignore = { workspace = true } +ra-ap-rustc_lexer = { workspace = true } +regex = { workspace = true, features = ["std", "perf", "unicode-perl"] } +similar = { workspace = true, features = ["text"] } +textwrap = { workspace = true } +thiserror = { workspace = true } +unicode-segmentation = { workspace = true } +unicode-width = { workspace = true } + +[dev-dependencies] +indoc = { workspace = true } +pretty_assertions = { workspace = true, features = ["std"] } + +[lints] +workspace = true + +[lib] +doctest = false diff --git a/crates/contrib/comfort/src/bin/cargo-comfort.rs b/crates/contrib/comfort/src/bin/cargo-comfort.rs new file mode 100644 index 00000000..78186415 --- /dev/null +++ b/crates/contrib/comfort/src/bin/cargo-comfort.rs @@ -0,0 +1,3 @@ +fn main() -> std::process::ExitCode { + comfort::cli_main() +} diff --git a/crates/contrib/comfort/src/bin/comfort.rs b/crates/contrib/comfort/src/bin/comfort.rs new file mode 100644 index 00000000..78186415 --- /dev/null +++ b/crates/contrib/comfort/src/bin/comfort.rs @@ -0,0 +1,3 @@ +fn main() -> std::process::ExitCode { + comfort::cli_main() +} diff --git a/crates/contrib/comfort/src/cli.rs b/crates/contrib/comfort/src/cli.rs new file mode 100644 index 00000000..e8896564 --- /dev/null +++ b/crates/contrib/comfort/src/cli.rs @@ -0,0 +1,159 @@ +//! CLI argument definitions. +//! +//! Two binaries share one entry point (`comfort::cli_main`): `comfort` +//! (direct) and `cargo-comfort` (a cargo subcommand). +//!
The binary entry detects which one it was invoked as, strips the leading +//! `comfort` argv inserted by cargo, and adjusts defaults — direct invocation +//! defaults to stdin/stdout, cargo invocation defaults to `--workspace`. + +use std::path::PathBuf; + +use clap::Parser; + +use crate::DEFAULT_MAX_WIDTH; + +/// Source language to format. +/// With [`Auto`], per-file detection (extension or `--stdin-filename`) +/// determines the format and workspace/directory walks include both Rust and +/// Markdown files. +/// With an explicit language, every selected file is formatted as that language +/// and walks filter to its extensions only. +/// +/// [`Auto`]: Language::Auto +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, clap::ValueEnum)] +#[clap(rename_all = "kebab-case")] +pub enum Language { + /// Detect per file: `.rs` → Rust, `.md`/`.markdown` → Markdown, + /// everything else → Rust (the stdin default and the dominant use case). + #[default] + Auto, + /// Force Rust mode regardless of extension. + Rust, + /// Force Markdown mode regardless of extension. + Markdown, +} + +impl Language { + /// Resolve the effective format for a given file path. + /// `None` for `path` means the caller has no filename hint (e.g. stdin + /// without `--stdin-filename`), in which case `Auto` defaults to Rust. + #[must_use] + pub fn resolve(self, path: Option<&std::path::Path>) -> Format { + match self { + Self::Rust => Format::Rust, + Self::Markdown => Format::Markdown, + Self::Auto => match path.and_then(|p| p.extension()).and_then(|e| e.to_str()) { + Some("md" | "markdown") => Format::Markdown, + _ => Format::Rust, + }, + } + } +} + +/// Resolved per-file format used by [`run`] to dispatch to the correct +/// pipeline. +/// +/// [`run`]: crate::run::run +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Format { + Rust, + Markdown, +} + +#[cfg(test)] +#[path = "cli_tests.rs"] +mod tests; + +/// How the binary was invoked. 
+/// Determines whether the empty-args default is stdin (direct) or `--workspace` +/// (cargo subcommand). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Invocation { + Direct, + Cargo, +} + +#[derive(Debug, Parser)] +#[command( + name = "comfort", + about = "Format Rust doc comments with semantic line breaks.", + long_about = "Reflows outer (`///`) and inner (`//!`) doc-comment blocks using semantic line \ + breaks (one sentence per line), with an optional `--max-width` safety net. \ + Inline `//` comments and `/** */` block-style doc comments are left untouched.", + version +)] +pub struct Cli { + /// Files or directories to format. + /// Directories are walked recursively; `.gitignore` is honored. + /// If no paths are given, the tool reads from stdin and writes to stdout + /// (direct invocation) or walks the whole workspace (cargo subcommand). + /// + /// Mutually exclusive with `--workspace`, `--package`, and `--exclude`. + #[arg(conflicts_with_all = ["workspace", "packages", "exclude"])] + pub paths: Vec<PathBuf>, + + /// Format every `.rs` file under the current cargo workspace. + /// Default for `cargo comfort`; explicit for `comfort`. + #[arg(long)] + pub workspace: bool, + + /// Limit the workspace walk to the named package(s). + /// Repeat the flag for multiple packages. + /// Implies workspace mode. + #[arg(short = 'p', long = "package", value_name = "SPEC")] + pub packages: Vec<String>, + + /// Exclude the named package(s) from the workspace walk. + /// Repeat the flag for multiple packages. + /// Implies workspace mode. + #[arg(long = "exclude", value_name = "SPEC")] + pub exclude: Vec<String>, + + /// Check whether files would change; print a diff and exit non-zero if any + /// do. + /// Never writes to disk. + #[arg(long)] + pub check: bool, + + /// Print the path of each changed file to stdout, one per line. + /// In write mode, lists files that were reformatted; in `--check` mode, + /// lists files that would be reformatted (and suppresses the diff).
+ #[arg(long)] + pub list_changed: bool, + + /// Force a specific source language. + /// With `auto` (default), detect from each file's extension and let + /// workspace/directory walks pick up both Rust and Markdown. + /// With `rust` or `markdown`, every selected file is formatted in that mode + /// and walks filter to its extensions only. + #[arg(long, value_enum, default_value_t = Language::Auto)] + pub language: Language, + + /// Also canonicalize the markdown structure of each formatted body: align + /// tables, normalise list markers, prefer fenced over indented code blocks, + /// etc. Off by default — in default mode, only paragraph prose gets + /// reflowed and everything else is preserved byte-for-byte. + #[arg(long)] + pub format_markdown: bool, + + /// Convert inline markdown links to reference-style links and move all + /// reference definitions to the bottom of the body. + /// Adaptive: shortcut form `[text]` where possible, full form + /// `[text][label]` for collisions. + /// Independent of `--format-markdown` — enable either, both, or neither. + #[arg(long)] + pub reference_links: bool, + + /// Maximum line width for reflow. + /// Long sentences wrap at word boundaries within sembr blocks. + /// `0` disables width wrapping. + #[arg(long, default_value_t = DEFAULT_MAX_WIDTH)] + pub max_width: usize, + + /// The original filename for content piped via stdin. + /// In `--language auto` (default), the extension drives format detection — + /// e.g. `--stdin-filename notes.md` switches to Markdown mode. + /// Also improves diagnostic messages; defaults to `<stdin>`. + #[arg(long, value_name = "PATH")] + pub stdin_filename: Option<PathBuf>, +} diff --git a/crates/contrib/comfort/src/cli_tests.rs b/crates/contrib/comfort/src/cli_tests.rs new file mode 100644 index 00000000..cf551126 --- /dev/null +++ b/crates/contrib/comfort/src/cli_tests.rs @@ -0,0 +1,69 @@ +//! Tests for the language resolution rules: which `Format` we end up with given +//!
the `--language` flag and an optional filename hint. + +use std::path::Path; + +use pretty_assertions::assert_eq; + +use super::{Format, Language}; + +#[test] +fn auto_with_rust_extension_resolves_to_rust() { + assert_eq!( + Language::Auto.resolve(Some(Path::new("foo.rs"))), + Format::Rust + ); +} + +#[test] +fn auto_with_markdown_extension_resolves_to_markdown() { + assert_eq!( + Language::Auto.resolve(Some(Path::new("foo.md"))), + Format::Markdown + ); + assert_eq!( + Language::Auto.resolve(Some(Path::new("foo.markdown"))), + Format::Markdown + ); +} + +#[test] +fn auto_with_unknown_extension_falls_back_to_rust() { + assert_eq!( + Language::Auto.resolve(Some(Path::new("foo.txt"))), + Format::Rust + ); +} + +#[test] +fn auto_with_no_filename_hint_falls_back_to_rust() { + // Stdin without `--stdin-filename`: no extension to detect from. + assert_eq!(Language::Auto.resolve(None), Format::Rust); +} + +#[test] +fn explicit_rust_overrides_markdown_extension() { + // The pushed-back case: user has rust code in a `.md` file (slides, + // generated stub, whatever) and forces rust mode. + assert_eq!( + Language::Rust.resolve(Some(Path::new("slides.md"))), + Format::Rust + ); +} + +#[test] +fn explicit_markdown_overrides_rust_extension() { + // Inverse of the above: rare but symmetric. + assert_eq!( + Language::Markdown.resolve(Some(Path::new("notes.rs"))), + Format::Markdown + ); +} + +#[test] +fn explicit_language_wins_over_missing_hint() { + // Stdin with `--language markdown` and no `--stdin-filename`: still + // markdown. + assert_eq!(Language::Markdown.resolve(None), Format::Markdown); + assert_eq!(Language::Rust.resolve(None), Format::Rust); +} diff --git a/crates/contrib/comfort/src/extract.rs b/crates/contrib/comfort/src/extract.rs new file mode 100644 index 00000000..f094eb37 --- /dev/null +++ b/crates/contrib/comfort/src/extract.rs @@ -0,0 +1,228 @@ +//! Doc-comment block extraction from Rust source. +//! +//! 
A *block* is a maximal run of consecutive line doc-comments — either outer +//! (`///`) or inner (`//!`) — sharing the same indentation and separated only +//! by a single newline. +//! Blank lines inside the block (i.e. +//! `///\n` with no body content) are part of the block; a truly blank source +//! line ends it. + +use std::ops::Range; + +use ra_ap_rustc_lexer::{DocStyle, FrontmatterAllowed, TokenKind, tokenize}; + +/// A contiguous run of `///` or `//!` lines in the source. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Block { + /// Byte range covering the block in the original source, starting at the + /// indent of the first line and ending just past the last comment line's + /// last character (not including its trailing newline). + pub range: Range<usize>, + + /// Whitespace prefix shared by every line of the block. + pub indent: String, + + /// Outer (`///`) or inner (`//!`). + pub style: DocStyle, + + /// Markdown body, one entry per source line, with the prefix and at most + /// one separator space stripped. + /// Empty entries represent blank doc-comment lines (a `///` with nothing + /// after it). + pub lines: Vec<String>, +} + +impl Block { + /// The marker for this block's style: `///` or `//!`. + #[must_use] + pub fn marker(&self) -> &'static str { + match self.style { + DocStyle::Outer => "///", + DocStyle::Inner => "//!", + } + } + + /// Source-column overhead of this block's per-line prefix: indent, marker, + /// and separator space. + /// Used to compute how much of a global `max_width` budget is left for body + /// content. + /// Counted in bytes; for the all-ASCII whitespace indents Rust source uses, + /// this matches the rendered column count. + #[must_use] + pub fn prefix_width(&self) -> usize { + self.indent.len() + self.marker().len() + 1 + } + + /// Reassemble the block as Rust source from a freshly formatted markdown + /// body.
+ /// Lines are split on `\n`; non-empty lines get ` {indent}{marker} `, + /// empty lines get `{indent}{marker}` with no trailing space. + #[must_use] + pub fn reassemble(&self, formatted_body: &str) -> String { + let marker = self.marker(); + let mut out = String::with_capacity(formatted_body.len() + self.indent.len() * 4); + + for (i, line) in formatted_body.split('\n').enumerate() { + if i > 0 { + out.push('\n'); + } + out.push_str(&self.indent); + out.push_str(marker); + if !line.is_empty() { + out.push(' '); + out.push_str(line); + } + } + + out + } +} + +/// Find all outer/inner doc-comment blocks in `source`, in source order. +/// +/// Only line doc-comments are recognised. +/// Block doc-comments (`/** */`, `/*! */`) and regular `//` comments are +/// ignored. +/// Doc-comments that don't start at the line's first non-whitespace character +/// (e.g. a `///` trailing some code) are also skipped. +#[must_use] +pub fn find_blocks(source: &str) -> Vec<Block> { + let bytes = source.as_bytes(); + let mut blocks: Vec<Block> = Vec::new(); + let mut offset: usize = 0; + + // Pending block we're still extending across consecutive lines. + let mut pending: Option<PendingBlock> = None; + + for token in tokenize(source, FrontmatterAllowed::Yes) { + let token_start = offset; + let token_end = offset + token.len as usize; + offset = token_end; + + match token.kind { + TokenKind::LineComment { + doc_style: Some(style), + } => { + // Confirm the comment starts at the beginning of a logical line + // (only whitespace between the previous '\n' and this token). + let line_start = line_start_of(bytes, token_start); + let leading = &source[line_start..token_start]; + if !leading.chars().all(|c| c == ' ' || c == '\t') { + // Trailing comment after code; flush any pending block.
+ if let Some(prev) = pending.take() { + blocks.push(prev.into_block()); + } + continue; + } + + let body = extract_body(&source[token_start..token_end], style); + + match pending.as_mut() { + Some(prev) + if prev.style == style + && prev.indent == leading + && prev.next_line_start == line_start => + { + prev.lines.push(body); + prev.end = token_end; + } + _ => { + if let Some(prev) = pending.take() { + blocks.push(prev.into_block()); + } + pending = Some(PendingBlock { + start: line_start, + end: token_end, + indent: leading.to_owned(), + style, + lines: vec![body], + next_line_start: line_start, + }); + } + } + } + TokenKind::Whitespace => { + // The block extends across a single `\n`. Two or more newlines + // (a truly blank source line) break the block. + let Some(prev) = pending.as_mut() else { + continue; + }; + let ws = &source[token_start..token_end]; + let mut newline_idx = None; + let mut newlines = 0_usize; + for (i, b) in ws.bytes().enumerate() { + if b == b'\n' { + newlines += 1; + newline_idx = Some(i); + } + } + if newlines == 1 { + // Record where the next line starts so we can confirm + // the next comment sits on that immediately following line. + let idx = newline_idx.unwrap_or(0); + prev.next_line_start = token_start + idx + 1; + } else if let Some(prev) = pending.take() { + blocks.push(prev.into_block()); + } + } + _ => { + if let Some(prev) = pending.take() { + blocks.push(prev.into_block()); + } + } + } + } + + if let Some(prev) = pending { + blocks.push(prev.into_block()); + } + + blocks +} + +struct PendingBlock { + start: usize, + end: usize, + indent: String, + style: DocStyle, + lines: Vec<String>, + // Byte position where the next line begins, used to confirm that the + // following doc-comment (if any) is the first content on its line.
+ next_line_start: usize, +} + +impl PendingBlock { + fn into_block(self) -> Block { + Block { + range: self.start..self.end, + indent: self.indent, + style: self.style, + lines: self.lines, + } + } +} + +/// Strip the `///` / `//!` marker and an optional single separator space. +fn extract_body(raw: &str, style: DocStyle) -> String { + let marker = match style { + DocStyle::Outer => "///", + DocStyle::Inner => "//!", + }; + let rest = raw.strip_prefix(marker).unwrap_or(raw); + // Strip at most one separator space; preserve additional indentation so + // markdown code blocks indented by 4+ spaces survive the round-trip. + rest.strip_prefix(' ').unwrap_or(rest).to_owned() +} + +/// Walk backwards from `pos` to find the byte index just past the previous `\n` +/// (or 0 if there is none). +fn line_start_of(bytes: &[u8], pos: usize) -> usize { + bytes[..pos] + .iter() + .rposition(|b| *b == b'\n') + .map_or(0, |i| i + 1) +} + +#[cfg(test)] +#[path = "extract_tests.rs"] +mod tests; diff --git a/crates/contrib/comfort/src/extract_tests.rs b/crates/contrib/comfort/src/extract_tests.rs new file mode 100644 index 00000000..7c22a8fe --- /dev/null +++ b/crates/contrib/comfort/src/extract_tests.rs @@ -0,0 +1,199 @@ +use indoc::indoc; +use pretty_assertions::assert_eq; +use ra_ap_rustc_lexer::DocStyle; + +use super::{Block, find_blocks}; + +#[test] +fn finds_single_outer_block() { + let src = "/// One line.\nfn f() {}\n"; + let blocks = find_blocks(src); + assert_eq!(blocks.len(), 1); + assert_eq!(blocks[0].style, DocStyle::Outer); + assert_eq!(blocks[0].indent, ""); + assert_eq!(blocks[0].lines, vec!["One line."]); + // Range covers `/// One line.` (13 chars), not the trailing newline. + assert_eq!(&src[blocks[0].range.clone()], "/// One line."); +} + +#[test] +fn finds_inner_doc_block() { + let src = "//! Module docs.\n//! 
Second line.\n"; + let blocks = find_blocks(src); + assert_eq!(blocks.len(), 1); + assert_eq!(blocks[0].style, DocStyle::Inner); + assert_eq!(blocks[0].lines, vec!["Module docs.", "Second line."]); +} + +#[test] +fn groups_consecutive_outer_lines_into_one_block() { + let src = indoc! {" + /// First. + /// Second. + /// Third. + fn f() {} + "}; + let blocks = find_blocks(src); + assert_eq!(blocks.len(), 1); + assert_eq!(blocks[0].lines, vec!["First.", "Second.", "Third."]); +} + +#[test] +fn preserves_empty_doc_lines_within_block() { + let src = indoc! {" + /// First paragraph. + /// + /// Second paragraph. + fn f() {} + "}; + let blocks = find_blocks(src); + assert_eq!(blocks.len(), 1); + assert_eq!(blocks[0].lines, vec![ + "First paragraph.", + "", + "Second paragraph." + ]); +} + +#[test] +fn separates_blocks_across_blank_source_line() { + let src = indoc! {" + /// First block. + + /// Second block. + fn f() {} + "}; + let blocks = find_blocks(src); + assert_eq!(blocks.len(), 2); + assert_eq!(blocks[0].lines, vec!["First block."]); + assert_eq!(blocks[1].lines, vec!["Second block."]); +} + +#[test] +fn separates_blocks_across_intervening_code() { + let src = indoc! {" + /// First. + fn f() {} + /// Second. + fn g() {} + "}; + let blocks = find_blocks(src); + assert_eq!(blocks.len(), 2); + assert_eq!(blocks[0].lines, vec!["First."]); + assert_eq!(blocks[1].lines, vec!["Second."]); +} + +#[test] +fn separates_outer_from_inner_block() { + // Different doc styles never merge, even with no intervening code. + let src = indoc! {" + //! Module doc. + /// Item doc. + fn f() {} + "}; + let blocks = find_blocks(src); + assert_eq!(blocks.len(), 2); + assert_eq!(blocks[0].style, DocStyle::Inner); + assert_eq!(blocks[1].style, DocStyle::Outer); +} + +#[test] +fn captures_indentation() { + let src = indoc! {" + mod m { + /// Indented doc. + /// Second line. 
+ fn f() {} + } + "}; + let blocks = find_blocks(src); + assert_eq!(blocks.len(), 1); + assert_eq!(blocks[0].indent, " "); + assert_eq!(blocks[0].lines, vec!["Indented doc.", "Second line."]); +} + +#[test] +fn skips_trailing_doc_after_code_on_same_line() { + // `///` only triggers a doc-comment when it starts the line. A `///` + // after code on the same line is still a doc-comment token to the + // lexer but it's misplaced; we ignore it. + let src = "let x = 5; /// not really a doc\n/// real doc\nfn f() {}\n"; + let blocks = find_blocks(src); + assert_eq!(blocks.len(), 1); + assert_eq!(blocks[0].lines, vec!["real doc"]); +} + +#[test] +fn ignores_triple_slash_inside_string_literals() { + let src = "fn f() { let s = \"/// not a doc\"; }\n/// real doc\nfn g() {}\n"; + let blocks = find_blocks(src); + assert_eq!(blocks.len(), 1); + assert_eq!(blocks[0].lines, vec!["real doc"]); +} + +#[test] +fn ignores_block_doc_comments() { + let src = "/** outer block doc */\n/*! inner block doc */\n/// real doc\nfn f() {}\n"; + let blocks = find_blocks(src); + assert_eq!(blocks.len(), 1); + assert_eq!(blocks[0].lines, vec!["real doc"]); +} + +#[test] +fn preserves_extra_leading_whitespace_for_markdown_code_blocks() { + // `/// foo` (4 extra spaces) becomes ` foo` in the body, which is + // a 4-space-indented markdown code block. Only ONE separator space is + // stripped. 
+ let src = "/// para\n/// code_block_line\n"; + let blocks = find_blocks(src); + assert_eq!(blocks.len(), 1); + assert_eq!(blocks[0].lines, vec!["para", " code_block_line"]); +} + +#[test] +fn block_marker_returns_outer_or_inner() { + let outer = Block { + range: 0..0, + indent: String::new(), + style: DocStyle::Outer, + lines: vec![], + }; + let inner = Block { + range: 0..0, + indent: String::new(), + style: DocStyle::Inner, + lines: vec![], + }; + assert_eq!(outer.marker(), "///"); + assert_eq!(inner.marker(), "//!"); +} + +#[test] +fn reassemble_uses_indent_and_marker() { + let block = Block { + range: 0..0, + indent: " ".to_owned(), + style: DocStyle::Outer, + lines: vec![], + }; + let formatted = "First line.\n\nSecond paragraph."; + let out = block.reassemble(formatted); + assert_eq!( + out, + " /// First line.\n ///\n /// Second paragraph." + ); +} + +#[test] +fn reassemble_does_not_add_trailing_space_on_empty_lines() { + let block = Block { + range: 0..0, + indent: String::new(), + style: DocStyle::Outer, + lines: vec![], + }; + let out = block.reassemble("a\n\nb"); + assert_eq!(out, "/// a\n///\n/// b"); + // Verify there's no `/// ` (with trailing space) on the empty line. + assert!(!out.contains("/// \n")); +} diff --git a/crates/contrib/comfort/src/format.rs b/crates/contrib/comfort/src/format.rs new file mode 100644 index 00000000..c482da9c --- /dev/null +++ b/crates/contrib/comfort/src/format.rs @@ -0,0 +1,1706 @@ +//! Pure source-string-in, source-string-out pipeline. +//! +//! The pipeline runs in three layers: +//! +//! 1. [`format_source`] finds `///` / `//!` doc-comment blocks via +//! [`find_blocks`] and splices their reformatted bodies back into the source +//! byte-for-byte. +//! 2. [`reflow_markdown`] parses each block's body with comrak, walks the AST +//! recursively, and hands each reflowable paragraph's text to +//! [`reflow_paragraph`]. +//! Leaf blocks that aren't paragraphs — reference link definitions, code +//! 
blocks, headings, tables, HTML blocks, thematic breaks — are preserved +//! verbatim, as are paragraphs that contain a hard line break (` \n ` or +//! `\\\n`). +//! 3. [`reflow_paragraph`] splits the paragraph into sentences with the +//! [`sentence`] module and width-wraps each sentence via `textwrap`, keeping +//! atomic tokens (URLs, paths, identifiers) intact even when they exceed +//! `max_width`. +//! +//! Containers we descend into: [`BlockQuote`], [`List`], [`Item`], +//! [`TaskItem`], [`Alert`], [`MultilineBlockQuote`], [`FootnoteDefinition`], +//! [`BlockDirective`]. +//! Each contributes a per-line continuation prefix that gets applied to every +//! line after the first (the first line's prefix is already in the source, +//! outside the Paragraph's sourcepos range). +//! +//! [`Alert`]: NodeValue::Alert +//! [`BlockDirective`]: NodeValue::BlockDirective +//! [`BlockQuote`]: NodeValue::BlockQuote +//! [`FootnoteDefinition`]: NodeValue::FootnoteDefinition +//! [`Item`]: NodeValue::Item +//! [`List`]: NodeValue::List +//! [`MultilineBlockQuote`]: NodeValue::MultilineBlockQuote +//! [`TaskItem`]: NodeValue::TaskItem +//! [`sentence`]: crate::sentence + +use std::{collections::HashMap, ops::Range, sync::Arc}; + +use comrak::{ + Arena, Options, ResolvedReference, + nodes::{AstNode, NodeValue, TableAlignment}, + options::{BrokenLinkCallback, BrokenLinkReference, Extension, ListStyleType, Parse, Render}, +}; +use textwrap::WordSplitter; +use unicode_width::UnicodeWidthStr; + +use crate::{extract::find_blocks, sentence::split_sentences}; + +/// Options that control which transformations the markdown pipeline applies on +/// top of the always-on sembr reflow. +/// Used by [`format_markdown_with`] and [`format_rust_source_with`]. +#[derive(Debug, Clone, Default)] +pub struct FormatOptions { + /// Maximum line width passed to the sembr engine. 
+ pub max_width: usize, + /// `--format-markdown`: canonicalize markdown structure (tables, list + /// markers, fences, etc.) via comrak's formatter plus our table aligner. + pub canonical: bool, + /// `--reference-links`: convert inline links to reference style and + /// consolidate definitions at the bottom of each body. + pub reference_links: bool, +} + +/// Reformat every `///` and `//!` block in `source`, returning the new text. +/// +/// `max_width` is the maximum source-line width the user wants to enforce. +/// Per block, the effective width handed to the reflow step is reduced by the +/// block's prefix overhead — its leading indent plus the `///` or `//!` marker +/// and separator space — so the user-visible ceiling is honoured regardless of +/// how deep the doc comment is nested. +/// +/// Returns the original `source` (as a fresh `String`) when no blocks need +/// reflow. +#[must_use] +pub fn format_source(source: &str, max_width: usize) -> String { + format_rust_source_with(source, &FormatOptions { + max_width, + ..Default::default() + }) +} + +/// Like [`format_source`], but also canonicalize the markdown inside each `///` +/// / `//!` block before reflowing it. +/// See [`format_markdown_canonical`] for what canonicalisation entails. +/// +/// The doc-comment scaffolding (`///` prefix, indentation, surrounding code) is +/// still byte-preserved; only the *body* of each block is rewritten. +#[must_use] +pub fn format_source_canonical(source: &str, max_width: usize) -> String { + format_rust_source_with(source, &FormatOptions { + max_width, + canonical: true, + ..Default::default() + }) +} + +/// Option-aware Rust-source entry point. +/// Each `///` / `//!` block's body goes through [`format_markdown_with`] with +/// the same options applied to it (with `max_width` adjusted for the block's +/// prefix overhead). 
+#[must_use] +pub fn format_rust_source_with(source: &str, opts: &FormatOptions) -> String { + format_source_impl(source, opts.max_width, |body, effective_width| { + let inner = FormatOptions { + max_width: effective_width, + ..opts.clone() + }; + format_markdown_with(body, &inner) + }) +} + +/// Option-aware markdown entry point. +/// Composes the optional canonical and reference-link passes before running the +/// always-on sembr reflow. +/// +/// The output preserves the input's *exact* trailing-newline count. +/// This matters for callers that map newlines back to source structure — e.g. +/// nvim's conform.nvim `injected` formatter, which extracts the markdown body +/// of Rust doc comments, runs comfort on it, and re-inserts. +/// Collapsing `\n\n` to `\n` would silently drop the trailing empty `///` line +/// on every save. +#[must_use] +pub fn format_markdown_with(body: &str, opts: &FormatOptions) -> String { + if body.is_empty() { + return String::new(); + } + let trailing_newlines = body + .as_bytes() + .iter() + .rev() + .take_while(|&&b| b == b'\n') + .count(); + + let mut text = if opts.canonical { + // Comrak's `format_commonmark` unconditionally emits links in inline + // form (`[text](url)`), dropping the user's reference definitions + // along the way. Protect reference-form links by sentinelising them + // and stashing definitions out-of-band before the canonical pass, + // then restore both afterwards. + let protection = protect_reference_form_links(body); + let canonical = match canonicalize_markdown(&protection.protected_text) { + Some(canonical) => align_tables(&canonical), + None => protection.protected_text.clone(), + }; + restore_protected_reference_links(&canonical, &protection) + } else { + body.to_owned() + }; + if opts.reference_links { + text = extract_reference_links(&text); + } + let text = reflow_markdown(&text, opts.max_width); + + // Both `canonicalize_markdown` and `extract_reference_links` track + // "trailing newline present?" 
but collapse multiple to one. Restore the + // exact count from the input. + let trimmed = text.trim_end_matches('\n'); + let mut out = String::with_capacity(trimmed.len() + trailing_newlines); + out.push_str(trimmed); + for _ in 0..trailing_newlines { + out.push('\n'); + } + out +} + +/// Shared implementation for the `///`-block pipeline. +/// The body processor differs between default mode, `--format-markdown`, and +/// `--reference-links`; passed in by the caller. +fn format_source_impl(source: &str, max_width: usize, process_body: F) -> String +where + F: Fn(&str, usize) -> String, +{ + let blocks = find_blocks(source); + if blocks.is_empty() { + return source.to_owned(); + } + + let mut out = String::with_capacity(source.len()); + let mut cursor = 0; + + for block in blocks { + out.push_str(&source[cursor..block.range.start]); + + let body = block.lines.join("\n"); + // Subtract the per-line prefix from the user's budget. If the + // prefix alone exceeds `max_width`, saturate to 0 (no width wrap) + // rather than wrapping every word onto its own line — the user's + // constraint is impossible here, so we degrade to pure sembr. + let effective_width = if max_width == 0 { + 0 + } else { + max_width.saturating_sub(block.prefix_width()) + }; + let formatted = process_body(&body, effective_width); + out.push_str(&block.reassemble(&formatted)); + + cursor = block.range.end; + } + + out.push_str(&source[cursor..]); + out +} + +/// Canonicalize the markdown structure of `body` (align tables, normalise list +/// markers, prefer fenced code blocks, etc.) and then reflow its paragraphs +/// with semantic line breaks. +/// +/// Canonicalisation is delegated to [`comrak::format_commonmark`] with our +/// render options (see `canonical_render_options` for the rationale); width +/// handling is left to our downstream sembr pipeline. +/// The output is the canonical markdown with paragraphs sembr'd. 
+/// +/// The input's trailing-newline convention is preserved: doc-comment block +/// bodies (no trailing newline) round-trip without one; markdown files (usually +/// trailing newline) keep theirs. +#[must_use] +pub fn format_markdown_canonical(body: &str, max_width: usize) -> String { + format_markdown_with(body, &FormatOptions { + max_width, + canonical: true, + ..Default::default() + }) +} + +/// Run comrak's `format_commonmark` over `body` and return the canonical +/// markdown text, with the input's trailing-newline convention preserved. +/// Returns `None` if the formatter errors — callers should fall back to the +/// input in that case. +fn canonicalize_markdown(body: &str) -> Option { + let arena = Arena::new(); + let parse_options = comrak_options(); + let root = comrak::parse_document(&arena, body, &parse_options); + + let render_options = canonical_render_options(); + let mut canonical = String::new(); + if comrak::format_commonmark(root, &render_options, &mut canonical).is_err() { + return None; + } + + // Comrak's formatter appends a trailing newline unconditionally; + // normalise to match the input's convention so the caller (block + // reassembly for `///` blocks, file writes for `.md` files) sees a + // consistent shape. + let canonical = match (body.ends_with('\n'), canonical.ends_with('\n')) { + (true, false) => canonical + "\n", + (false, true) => canonical.trim_end_matches('\n').to_owned(), + _ => canonical, + }; + + Some(canonical) +} + +/// Re-parse `text` to find markdown tables, then rewrite each one with column +/// widths padded for visual alignment. +/// The separator row's alignment markers come from the AST's [`TableAlignment`] +/// (the colon pattern in the source), not from re-scanning the text. +/// +/// Tables are identified by [`NodeValue::Table`] nodes; cell content is taken +/// from each [`NodeValue::TableCell`]'s sourcepos slice, so any inline markdown +/// (`**bold**`, `` `code` ``) and escapes (`\|`) survive verbatim. 
+fn align_tables(text: &str) -> String { + if text.is_empty() { + return String::new(); + } + + let arena = Arena::new(); + let options = comrak_options(); + let root = comrak::parse_document(&arena, text, &options); + let line_starts = line_start_offsets(text); + + let mut replacements: Vec = Vec::new(); + collect_table_replacements(root, text, &line_starts, &mut replacements); + + if replacements.is_empty() { + return text.to_owned(); + } + + replacements.sort_by_key(|r| r.range.start); + + let mut out = String::with_capacity(text.len()); + let mut cursor = 0; + for r in replacements { + out.push_str(&text[cursor..r.range.start]); + out.push_str(&r.text); + cursor = r.range.end; + } + out.push_str(&text[cursor..]); + out +} + +/// Walk the AST, queueing a [`Replacement`] for every table found. +fn collect_table_replacements<'a>( + node: &'a AstNode<'a>, + text: &str, + line_starts: &[usize], + out: &mut Vec, +) { + let data = node.data(); + if let NodeValue::Table(table_meta) = &data.value { + if let Some(range) = sourcepos_to_byte_range(line_starts, text.len(), &data.sourcepos) + && let Some(aligned) = + render_aligned_table(node, &table_meta.alignments, text, line_starts) + { + // Preserve the trailing newline convention of the source slice + // — if the original ended with `\n`, the replacement should + // too (and vice versa). + let original_slice = &text[range.clone()]; + let aligned = match (original_slice.ends_with('\n'), aligned.ends_with('\n')) { + (true, false) => aligned + "\n", + (false, true) => aligned.trim_end_matches('\n').to_owned(), + _ => aligned, + }; + out.push(Replacement { + range, + text: aligned, + }); + } + // Don't descend further — tables don't nest within tables in our model. + return; + } + for child in node.children() { + collect_table_replacements(child, text, line_starts, out); + } +} + +/// Build the aligned markdown text for a single table node. 
+/// Returns `None` if the table is malformed (no rows, mismatched cell counts, +/// sourcepos gaps) — in which case the caller falls back to leaving the source +/// unchanged. +fn render_aligned_table<'a>( + table: &'a AstNode<'a>, + alignments: &[TableAlignment], + text: &str, + line_starts: &[usize], +) -> Option { + let num_cols = alignments.len(); + if num_cols == 0 { + return None; + } + + // Walk rows → cells, slicing each cell's source bytes via its sourcepos. + let mut rows: Vec> = Vec::new(); + for row_node in table.children() { + if !matches!(row_node.data().value, NodeValue::TableRow(_)) { + continue; + } + let mut cells: Vec = Vec::new(); + for cell_node in row_node.children() { + if !matches!(cell_node.data().value, NodeValue::TableCell) { + continue; + } + let cell_range = + sourcepos_to_byte_range(line_starts, text.len(), &cell_node.data().sourcepos)?; + // Trim the cell's source slice. Comrak's cell sourcepos usually + // covers the content between the `|` delimiters with any leading + // and trailing spaces, but trimming defensively handles both + // shapes. + let raw = text[cell_range].trim(); + cells.push(raw.to_owned()); + } + rows.push(cells); + } + + if rows.is_empty() { + return None; + } + + // Column widths: max display width per column, with a floor of 3 so + // the separator row's alignment markers (`:-:`, `---`) always fit. + // `UnicodeWidthStr::width` gives terminal-cell width — wide chars (CJK) + // count as 2, zero-width chars as 0, which matches what a human eye + // sees when scanning a column. + let mut col_widths = vec![3_usize; num_cols]; + for row in &rows { + for (col, cell) in row.iter().enumerate() { + if col < num_cols { + col_widths[col] = col_widths[col].max(UnicodeWidthStr::width(cell.as_str())); + } + } + } + + // Emit. GFM tables: row 0 is the header; the separator row follows + // (synthesised from `alignments`); remaining rows are data rows. 
+ let mut out = String::new(); + for (row_idx, row) in rows.iter().enumerate() { + emit_data_row(&mut out, row, &col_widths, alignments, num_cols); + if row_idx == 0 { + emit_separator_row(&mut out, &col_widths, alignments, num_cols); + } + } + + Some(out) +} + +fn emit_data_row( + out: &mut String, + row: &[String], + col_widths: &[usize], + alignments: &[TableAlignment], + num_cols: usize, +) { + out.push('|'); + for col in 0..num_cols { + let cell = row.get(col).map_or("", String::as_str); + let padded = pad_cell(cell, col_widths[col], alignments[col]); + out.push(' '); + out.push_str(&padded); + out.push_str(" |"); + } + out.push('\n'); +} + +fn emit_separator_row( + out: &mut String, + col_widths: &[usize], + alignments: &[TableAlignment], + num_cols: usize, +) { + out.push('|'); + for col in 0..num_cols { + let w = col_widths[col]; + let sep = match alignments[col] { + // The colon-or-not pattern encodes alignment; width = `w`. + TableAlignment::Left => format!(":{}", "-".repeat(w.saturating_sub(1))), + TableAlignment::Right => format!("{}:", "-".repeat(w.saturating_sub(1))), + TableAlignment::Center => format!(":{}:", "-".repeat(w.saturating_sub(2))), + TableAlignment::None => "-".repeat(w), + }; + out.push(' '); + out.push_str(&sep); + out.push_str(" |"); + } + out.push('\n'); +} + +fn pad_cell(content: &str, width: usize, alignment: TableAlignment) -> String { + let content_width = UnicodeWidthStr::width(content); + let pad = width.saturating_sub(content_width); + match alignment { + TableAlignment::Right => format!("{}{content}", " ".repeat(pad)), + TableAlignment::Center => { + let left = pad / 2; + let right = pad - left; + format!("{}{content}{}", " ".repeat(left), " ".repeat(right)) + } + TableAlignment::Left | TableAlignment::None => { + format!("{content}{}", " ".repeat(pad)) + } + } +} + +// --------------------------------------------------------------------------- +// `--reference-links`: convert inline links to reference style and +// 
consolidate definitions at the bottom of the body. +// --------------------------------------------------------------------------- + +/// Convert inline markdown links to reference-style links and move all +/// reference definitions to the bottom of `text`. +/// +/// Adaptive label strategy: +/// +/// - Shortcut form `[text]` when the link's text can serve as a unique label. +/// - Full form `[text][label]` when text collides with an already-used label +/// for a different URL (label gets a `-N` suffix). +/// +/// Pre-existing scattered reference definitions are also moved to the bottom +/// and sorted alphabetically. +fn extract_reference_links(text: &str) -> String { + if text.is_empty() { + return String::new(); + } + let had_trailing_newline = text.ends_with('\n'); + + // Pull out any existing `[label]: url "title"` definitions; the result + // is the text minus those lines, plus a list of definitions. + let (text_without_defs, existing_defs) = extract_existing_reference_definitions(text); + + // Seed the label map with existing definitions so newly converted + // inline links can reuse them via full-form references. + let mut label_map = LabelMap::default(); + let mut all_defs: Vec = Vec::new(); + for def in existing_defs { + label_map.register(&def); + all_defs.push(def); + } + + // Walk the AST for inline `Link` nodes and queue conversions. Each new + // definition gets appended to `all_defs` as it's discovered. + let arena = Arena::new(); + let options = comrak_options(); + let root = comrak::parse_document(&arena, &text_without_defs, &options); + let line_starts = line_start_offsets(&text_without_defs); + + let mut replacements: Vec = Vec::new(); + collect_inline_link_replacements( + root, + &text_without_defs, + &line_starts, + &mut label_map, + &mut all_defs, + &mut replacements, + ); + + // Splice link replacements into the text. 
+ let text_after = if replacements.is_empty() { + text_without_defs + } else { + replacements.sort_by_key(|r| r.range.start); + let mut out = String::with_capacity(text_without_defs.len()); + let mut cursor = 0; + for r in &replacements { + out.push_str(&text_without_defs[cursor..r.range.start]); + out.push_str(&r.text); + cursor = r.range.end; + } + out.push_str(&text_without_defs[cursor..]); + out + }; + + // Append all definitions, sorted alphabetically by label, at the + // bottom of the body with a blank-line separator. + let result = if all_defs.is_empty() { + text_after + } else { + all_defs.sort_by(|a, b| a.label.cmp(&b.label)); + let mut result = text_after.trim_end().to_owned(); + if !result.is_empty() { + result.push_str("\n\n"); + } + for def in &all_defs { + result.push_str(&def.render()); + result.push('\n'); + } + // Strip the trailing `\n` we just added; the trailing-newline + // adjustment below will put one back if the input had one. + result.trim_end_matches('\n').to_owned() + }; + + if had_trailing_newline && !result.ends_with('\n') { + format!("{result}\n") + } else { + result + } +} + +/// A single CommonMark reference-link definition. +/// `title` is empty when the definition has no title; otherwise it's the +/// unescaped title text (matching how comrak hands us inline-link titles). +#[derive(Debug, Clone, PartialEq, Eq)] +struct LinkDef { + label: String, + url: String, + title: String, +} + +impl LinkDef { + /// Render as a definition line, without the trailing newline. + /// Uses double-quoted title syntax; embedded backslashes and double quotes + /// are backslash-escaped (CommonMark allows backslash escapes in titles). 
+ fn render(&self) -> String { + if self.title.is_empty() { + format!("[{}]: {}", self.label, self.url) + } else { + let escaped = self.title.replace('\\', r"\\").replace('"', r#"\""#); + format!("[{}]: {} \"{escaped}\"", self.label, self.url) + } + } +} + +/// Normalize a reference label per CommonMark §4.7: Unicode case fold, then +/// trim outer whitespace and collapse internal whitespace runs to a single +/// space. +/// Two labels match when their normalized forms are equal. +/// +/// We use `str::to_lowercase` as a pragmatic stand-in for full Unicode case +/// folding — it covers ASCII and the Latin/Cyrillic/Greek scripts that show up +/// in practice, without pulling in a new dependency. +fn normalize_label(label: &str) -> String { + label + .to_lowercase() + .split_whitespace() + .collect::>() + .join(" ") +} + +/// Label registry that tracks bidirectional label ↔ (url, title) mapping. +/// Used to decide whether a new inline link can reuse an existing definition +/// (shortcut form, full form, or a fresh definition). +/// +/// `by_label` is keyed by the *normalized* label (CommonMark §4.7 — +/// case-insensitive, whitespace-folded), so an existing `[Foo]: /old` collides +/// with an inline `[foo](/new)` as the renderer would: without that, we'd emit +/// two definitions sharing one canonical label and the renderer would resolve +/// the converted shortcut to whichever came first. +/// +/// `by_url_title` keys on the literal `(url, title)` tuple so two links +/// pointing at the same URL with different titles get distinct definitions — +/// otherwise the title metadata of one would be silently dropped. +/// Its values are the *original-cased* labels, so full-form references write +/// `[text][Foo]` (the casing the definition is stored under) rather than the +/// normalized form. +#[derive(Debug, Default)] +struct LabelMap { + by_label: HashMap, + by_url_title: HashMap<(String, String), String>, +} + +impl LabelMap { + /// Register a definition. 
+ /// If the `(url, title)` pair doesn't already have a canonical label, this + /// one becomes it. + fn register(&mut self, def: &LinkDef) { + self.by_label + .entry(normalize_label(&def.label)) + .or_insert_with(|| (def.url.clone(), def.title.clone())); + self.by_url_title + .entry((def.url.clone(), def.title.clone())) + .or_insert_with(|| def.label.clone()); + } + + /// Resolve an inline `[text](url "title")` link to its reference-form + /// replacement and, if a new definition was needed, append it to `defs`. + fn resolve_inline( + &mut self, + text: &str, + url: &str, + title: &str, + defs: &mut Vec, + ) -> String { + // (url, title) already has a canonical label? + let key = (url.to_owned(), title.to_owned()); + if let Some(existing_label) = self.by_url_title.get(&key) { + let existing_label = existing_label.clone(); + return if existing_label == text { + format!("[{text}]") + } else { + format!("[{text}][{existing_label}]") + }; + } + // New (url, title) — pick a label. Use the link text if its + // normalized form is free; otherwise disambiguate with a numeric + // suffix. Collision checks go through `normalize_label` so we don't + // emit `[foo]: /new` next to an existing `[Foo]: /old`. + let label = if self.by_label.contains_key(&normalize_label(text)) { + let mut i = 2_usize; + loop { + let candidate = format!("{text}-{i}"); + if !self.by_label.contains_key(&normalize_label(&candidate)) { + break candidate; + } + i += 1; + } + } else { + text.to_owned() + }; + self.by_label + .insert(normalize_label(&label), (url.to_owned(), title.to_owned())); + self.by_url_title.insert(key, label.clone()); + defs.push(LinkDef { + label: label.clone(), + url: url.to_owned(), + title: title.to_owned(), + }); + if label == text { + format!("[{text}]") + } else { + format!("[{text}][{label}]") + } + } +} + +/// Walk the AST for inline `Link` nodes. +/// For each, queue a [`Replacement`] of its source bytes with the +/// reference-form output. 
+/// Anchor links (`#fragment`), images, autolinks, and pre-existing +/// reference-form links are left alone. +fn collect_inline_link_replacements<'a>( + node: &'a AstNode<'a>, + text: &str, + line_starts: &[usize], + label_map: &mut LabelMap, + defs: &mut Vec, + out: &mut Vec, +) { + let data = node.data(); + match &data.value { + NodeValue::Link(link) => { + // Skip anchor-only URLs (`#foo`) and images (Image is its own + // NodeValue variant so the match below handles that). + if !link.url.starts_with('#') + && let Some(range) = + sourcepos_to_byte_range(line_starts, text.len(), &data.sourcepos) + && let Some(link_text) = parse_inline_link_text(&text[range.clone()]) + { + let replacement = + label_map.resolve_inline(&link_text, &link.url, &link.title, defs); + out.push(Replacement { + range, + text: replacement, + }); + } + // Don't descend into Link children — they're inlines that get + // included in the replacement text already. + return; + } + NodeValue::Image(_) => { + // Per design: leave images as inline `![alt](url)`. Don't recurse. + return; + } + _ => {} + } + for child in node.children() { + collect_inline_link_replacements(child, text, line_starts, label_map, defs, out); + } +} + +/// If `slice` is the source of an inline-form link `[text](url)`, return the +/// raw text between `[` and `](`. +/// Returns `None` for reference-form links (`[text][label]`, `[label][]`, +/// `[label]`) and for autolinks. +fn parse_inline_link_text(slice: &str) -> Option { + let bytes = slice.as_bytes(); + if bytes.first() != Some(&b'[') { + return None; + } + let mut depth = 0_i32; + let mut i = 0_usize; + while i < bytes.len() { + match bytes[i] { + b'\\' => { + // Backslash-escape: skip the next byte too. + i += 2; + continue; + } + b'[' => depth += 1, + b']' => { + depth -= 1; + if depth == 0 { + // Inline form requires `(` immediately after the + // matched `]`. Anything else (`[`, end-of-slice, + // whitespace) is a reference form or invalid. 
+ if bytes.get(i + 1) == Some(&b'(') { + // Text is between the opening `[` (index 0) and the + // closing `]` (index i). + return Some(slice[1..i].to_owned()); + } + return None; + } + } + _ => {} + } + i += 1; + } + None +} + +/// Find pre-existing reference definitions in `text` (lines of the form +/// `[label]: url`) at the document level, returning the text with those lines +/// removed and a list of the extracted `(label, url)` pairs. +/// +/// Lines inside fenced code blocks and HTML blocks are skipped, identified via +/// comrak's AST so we don't false-match content that just happens to look like +/// a definition. +fn extract_existing_reference_definitions(text: &str) -> (String, Vec) { + let arena = Arena::new(); + let options = comrak_options(); + let root = comrak::parse_document(&arena, text, &options); + let line_starts = line_start_offsets(text); + + let mut excluded: Vec> = Vec::new(); + collect_excluded_ranges_for_refdefs(root, text, &line_starts, &mut excluded); + + let mut content_lines: Vec<&str> = Vec::new(); + let mut defs: Vec = Vec::new(); + let mut byte_pos = 0_usize; + + for line in text.split('\n') { + let line_start = byte_pos; + let line_end = byte_pos + line.len(); + let in_excluded = excluded + .iter() + .any(|r| line_start >= r.start && line_end <= r.end); + if !in_excluded && let Some(def) = parse_reference_definition_line(line) { + defs.push(def); + } else { + content_lines.push(line); + } + // Advance past line and its trailing `\n` (if any). + byte_pos = line_end + 1; + } + + (content_lines.join("\n"), defs) +} + +/// Walk the AST for block ranges where a `[label]: url` shape must NOT be +/// extracted as a reference definition. +/// +/// - [`CodeBlock`] / [`HtmlBlock`]: the bracket pattern is literal content. +/// - [`Paragraph`]: CommonMark forbids reference definitions from interrupting +/// a paragraph, so a `[label]: url` line that comrak parsed as part of a +/// paragraph's sourcepos is visible prose, not a definition. 
+/// +/// [`CodeBlock`]: NodeValue::CodeBlock +/// [`HtmlBlock`]: NodeValue::HtmlBlock +/// [`Paragraph`]: NodeValue::Paragraph +fn collect_excluded_ranges_for_refdefs<'a>( + node: &'a AstNode<'a>, + text: &str, + line_starts: &[usize], + out: &mut Vec>, +) { + let data = node.data(); + if matches!( + data.value, + NodeValue::CodeBlock(_) | NodeValue::HtmlBlock(_) | NodeValue::Paragraph + ) && let Some(range) = sourcepos_to_byte_range(line_starts, text.len(), &data.sourcepos) + { + out.push(range); + // Paragraphs have only inline children; code and HTML blocks are + // leaves. No further recursion needed. + return; + } + for child in node.children() { + collect_excluded_ranges_for_refdefs(child, text, line_starts, out); + } +} + +/// Parse a single line as a CommonMark-ish reference definition `[label]: url +/// "title"`. +/// Title is optional and may be enclosed in `"..."`, `'...'`, or `(...)`. +/// Backslash escapes inside the title are unescaped (CommonMark semantics) so +/// the stored value matches how comrak gives us inline-link titles. +/// Returns `None` for lines that don't match the reference-definition shape. +/// +/// Multi-line titles (where the title sits on the line after the URL) are +/// **not** supported here; this matches the rest of the pipeline, which only +/// extracts same-line definitions. +fn parse_reference_definition_line(line: &str) -> Option { + let trimmed = line.trim_start(); + let indent = line.len() - trimmed.len(); + // CommonMark allows up to 3 spaces of indentation. + if indent > 3 || !trimmed.starts_with('[') { + return None; + } + + // Find the matching `]`, allowing nested `[...]` inside the label. 
/// Parse a CommonMark reference-definition title.
///
/// Accepts the three CommonMark forms: `"..."`, `'...'`, or `(...)`.
/// `s` must be exactly the title: the opening delimiter is its first byte and
/// the matching closing delimiter its last byte.
///
/// Backslash escapes are resolved with CommonMark semantics: a backslash
/// followed by an ASCII punctuation character yields that character, while a
/// backslash in front of anything else is a literal backslash and is kept.
///
/// Returns `None` when `s` is not a well-formed title:
/// - it does not start with one of the three opening delimiters,
/// - it does not end with the matching closing delimiter (`"..."trailing`),
/// - the closing delimiter is itself backslash-escaped (`"foo\"`), which
///   leaves the title unterminated,
/// - a quote-delimited title contains an unescaped quote of the same kind
///   (`"foo"bar"`).
///
/// Parenthesised titles are not checked for balanced inner parentheses;
/// nested unescaped parens are rare in titles.
fn parse_quoted_title(s: &str) -> Option<String> {
    let (open, close) = match s.as_bytes().first()? {
        b'"' => ('"', '"'),
        b'\'' => ('\'', '\''),
        b'(' => ('(', ')'),
        _ => return None,
    };
    // The closing delimiter must be the last byte and distinct from the
    // opening one, hence at least two bytes.
    if s.len() < 2 || !s.ends_with(close) {
        return None;
    }
    // Both delimiters are single-byte ASCII, so these are char boundaries.
    let inner = &s[1..s.len() - 1];

    let mut unescaped = String::with_capacity(inner.len());
    let mut chars = inner.chars().peekable();
    while let Some(c) = chars.next() {
        if c == '\\' {
            match chars.peek().copied() {
                // Only ASCII punctuation can be escaped.
                Some(next) if next.is_ascii_punctuation() => {
                    unescaped.push(next);
                    chars.next();
                }
                // Backslash before anything else is literal; the following
                // character is handled by the next iteration.
                Some(_) => unescaped.push('\\'),
                // A lone trailing backslash escapes what we took for the
                // closing delimiter: the title never terminates.
                None => return None,
            }
            continue;
        }
        // An unescaped quote of the delimiter kind would end the title early.
        if open == close && c == open {
            return None;
        }
        unescaped.push(c);
    }
    Some(unescaped)
}
+// --------------------------------------------------------------------------- + +struct LinkProtection { + /// Sentinel-substituted text fed to the canonical pass. + protected_text: String, + /// For each citation: (sentinel string, original source bytes). + citations: Vec<(String, String)>, + /// Original reference-definition lines, in source order, to re-append after + /// canonical. + /// The text-without-defs is what we sentinelise and pass to the canonical + /// pass. + definitions: Vec, +} + +fn protect_reference_form_links(text: &str) -> LinkProtection { + if text.is_empty() { + return LinkProtection { + protected_text: String::new(), + citations: Vec::new(), + definitions: Vec::new(), + }; + } + + let arena = Arena::new(); + let options = comrak_options_with_intra_doc_links(); + let root = comrak::parse_document(&arena, text, &options); + let line_starts = line_start_offsets(text); + + // Collect citation source ranges (reference-form links only — inline + // links and autolinks are left alone). + let mut citation_ranges: Vec> = Vec::new(); + collect_reference_form_link_ranges(root, text, &line_starts, &mut citation_ranges); + + // Collect reference definition line ranges, excluding code blocks and + // HTML blocks (where `[label]: url` patterns are content, not defs). + let mut excluded: Vec> = Vec::new(); + collect_excluded_ranges_for_refdefs(root, text, &line_starts, &mut excluded); + + let mut definitions: Vec = Vec::new(); + let mut definition_ranges: Vec> = Vec::new(); + let mut byte_pos = 0_usize; + for line in text.split('\n') { + let line_start = byte_pos; + let line_end = byte_pos + line.len(); + let in_excluded = excluded + .iter() + .any(|r| line_start >= r.start && line_end <= r.end); + if !in_excluded && parse_reference_definition_line(line).is_some() { + // Include the trailing newline (if any) so the line and its + // separator are both removed cleanly. 
+ let range_end = if line_end < text.len() { + line_end + 1 + } else { + line_end + }; + definition_ranges.push(line_start..range_end); + definitions.push(line.to_owned()); + } + byte_pos = line_end + 1; + } + + // Build the sentinel-substituted text. + let mut substitutions: Vec<(Range, String)> = Vec::new(); + let mut citations: Vec<(String, String)> = Vec::new(); + for range in citation_ranges { + let sentinel = format!("XCMFRTLR{:04}X", citations.len()); + let original = text[range.clone()].to_owned(); + substitutions.push((range, sentinel.clone())); + citations.push((sentinel, original)); + } + for range in definition_ranges { + substitutions.push((range, String::new())); + } + substitutions.sort_by_key(|(r, _)| r.start); + + let mut protected_text = String::with_capacity(text.len()); + let mut cursor = 0_usize; + for (range, replacement) in substitutions { + protected_text.push_str(&text[cursor..range.start]); + protected_text.push_str(&replacement); + cursor = range.end; + } + protected_text.push_str(&text[cursor..]); + + LinkProtection { + protected_text, + citations, + definitions, + } +} + +fn restore_protected_reference_links(canonical: &str, protection: &LinkProtection) -> String { + let had_trailing_newline = canonical.ends_with('\n'); + let mut text = canonical.to_owned(); + + // Step 1: replace sentinels with original citation source. + for (sentinel, original) in &protection.citations { + text = text.replace(sentinel, original); + } + + // Step 2: re-append definitions at the bottom (separated by a blank + // line). If `--reference-links` is also enabled, the subsequent + // `extract_reference_links` pass will re-sort and consolidate. 
/// Decide whether `slice`, the source of a single link, is written in
/// reference form (`[text][label]`, `[label][]`, or shortcut `[label]`).
///
/// The bracket matching the leading `[` is located, honouring nesting and
/// backslash escapes. The link is reference-form unless that bracket is
/// followed immediately by `(`, which marks an inline link.
///
/// Returns `false` for inline links, for slices that do not start with `[`
/// (such as autolinks), and for slices whose leading bracket is never closed.
fn is_reference_form_link(slice: &str) -> bool {
    let raw = slice.as_bytes();
    if raw.first() != Some(&b'[') {
        return false;
    }

    let mut nesting = 0_i32;
    // Set after a backslash so the byte that follows is ignored.
    let mut escaped = false;
    for (pos, &byte) in raw.iter().enumerate() {
        if escaped {
            escaped = false;
            continue;
        }
        match byte {
            b'\\' => escaped = true,
            b'[' => nesting += 1,
            b']' => {
                nesting -= 1;
                if nesting == 0 {
                    // Anything but `(` here (another `[`, whitespace, end
                    // of slice) means reference form.
                    return raw.get(pos + 1) != Some(&b'(');
                }
            }
            _ => {}
        }
    }
    false
}
+#[must_use] +pub fn reflow_markdown(body: &str, max_width: usize) -> String { + if body.is_empty() { + return String::new(); + } + + let arena = Arena::new(); + let options = comrak_options(); + let root = comrak::parse_document(&arena, body, &options); + + let line_starts = line_start_offsets(body); + let mut replacements: Vec = Vec::new(); + let mut ancestors: Vec<&AstNode<'_>> = Vec::new(); + collect_paragraphs( + root, + &mut ancestors, + &mut replacements, + body, + &line_starts, + max_width, + ); + + if replacements.is_empty() { + return body.to_owned(); + } + + // Comrak doesn't guarantee AST order matches source order: footnote + // definitions in particular get reordered (the definition appears in + // the AST after the paragraph that references it, regardless of where + // it lived in the source). Sort by source byte offset before splicing + // so the cursor walks the body in monotonic order. + replacements.sort_by_key(|r| r.range.start); + + let mut out = String::with_capacity(body.len()); + let mut cursor = 0; + for r in replacements { + out.push_str(&body[cursor..r.range.start]); + out.push_str(&r.text); + cursor = r.range.end; + } + out.push_str(&body[cursor..]); + out +} + +/// Resolver that turns unresolved shortcut/collapsed references (`[X]` or +/// `[X][]`) into dummy `Link` AST nodes — specifically the ones that look like +/// Rust intra-doc links (`[`foo`]`, `[crate::Foo]`, etc.). +/// Without this, comrak's parser treats unresolved references as plain text +/// with literal `[` and `]`, which the formatter then defensively escapes as +/// `[X]`. +/// By forcing intra-doc-like labels to be `Link` nodes, +/// [`protect_reference_form_links`] can sentinelise their source bytes and +/// bypass comrak's escape logic entirely. 
+/// +/// **Critically narrow.** The callback must *not* match task-list markers (`[ +/// ]`, `[x]`, `[X]`) or footnote references (`[^name]`): `broken_link_callback` +/// fires before the `tasklist` / `footnotes` extensions get to recognise them, +/// so a too-eager callback eats task items and footnotes silently. +/// Returning `None` for those patterns lets the extensions handle them. +/// +/// The dummy URL is empty; the value never reaches output because protection +/// substitutes the source bytes back verbatim. +struct ResolveIntraDocLinks; + +impl BrokenLinkCallback for ResolveIntraDocLinks { + fn resolve(&self, link: BrokenLinkReference<'_>) -> Option { + let label = link.normalized.trim(); + // Footnote references: handled by the footnotes extension. + if label.starts_with('^') { + return None; + } + // Task-list markers: `[ ]` normalises to empty, `[x]` / `[X]` + // normalise to single characters. Let the tasklist extension + // recognise them. + if label.is_empty() || label.eq_ignore_ascii_case("x") { + return None; + } + Some(ResolvedReference { + url: String::new(), + title: String::new(), + }) + } +} + +/// Build the comrak parse options used throughout the pipeline. +/// Factored out so the re-parse for block-quote-nested paragraphs (see +/// [`collect_inline_atomic_ranges_from_text`]) uses the exact same extension +/// set. +/// +/// Note: this is the *plain* parse options without the intra-doc broken-link +/// callback. +/// The callback would interfere with the tasklist and footnotes extensions (see +/// [`ResolveIntraDocLinks`]). +/// Use [`comrak_options_with_intra_doc_links`] only where the callback's effect +/// is genuinely needed — currently only [`protect_reference_form_links`]. 
+fn comrak_options() -> Options<'static> { + Options { + extension: Extension { + table: true, + tasklist: true, + alerts: true, + multiline_block_quotes: true, + footnotes: true, + block_directive: true, + // Detect YAML frontmatter (`---` at the top of a document). + // Required for markdown files; benign for doc comments because + // frontmatter only triggers when the first non-empty line of + // the document is the delimiter, which is almost never the case + // inside a `///` block. + front_matter_delimiter: Some("---".to_owned()), + ..Default::default() + }, + ..Default::default() + } +} + +/// Parse options with the [`ResolveIntraDocLinks`] callback enabled, so +/// unresolved intra-doc shortcut/collapsed references become `Link` nodes in +/// the AST. +/// Used exclusively by [`protect_reference_form_links`] to find these +/// references and sentinelise their source bytes. +fn comrak_options_with_intra_doc_links() -> Options<'static> { + let mut options = comrak_options(); + options.parse = Parse { + broken_link_callback: Some(Arc::new(ResolveIntraDocLinks)), + ..Default::default() + }; + options +} + +/// Recursively walk the AST collecting paragraphs to reflow. +/// +/// Descends into the container types matched explicitly below. +/// Other containers (e.g. +/// `DescriptionList`) and leaf blocks (code blocks, headings, tables, HTML +/// blocks) are skipped, so their content survives verbatim. +/// +/// Paragraphs that contain a [`LineBreak`] inline child — i.e. an explicit +/// markdown hard break — are also left untouched. +/// `collapse_whitespace` in the sembr step would otherwise silently eat the +/// hard-break marker, changing how rustdoc renders the paragraph. +/// The same coarse-grained rule we apply to code blocks and tables: when reflow +/// would lose information, opt out of reflow for the whole element. 
+/// +/// [`LineBreak`]: NodeValue::LineBreak +fn collect_paragraphs<'a>( + node: &'a AstNode<'a>, + ancestors: &mut Vec<&'a AstNode<'a>>, + out: &mut Vec, + body: &str, + line_starts: &[usize], + max_width: usize, +) { + let data = node.data(); + match &data.value { + NodeValue::Paragraph => { + // Hard breaks (` \n` or `\\\n`) mean the user deliberately + // chose where lines break; reflowing would silently destroy + // that intent. Leave the paragraph verbatim and skip ahead. + if has_hard_line_break(node) { + return; + } + let Some(range) = sourcepos_to_byte_range(line_starts, body.len(), &data.sourcepos) + else { + return; + }; + let prefix = continuation_prefix_from_ancestors(ancestors); + let paragraph_max = if max_width == 0 { + 0 + } else { + max_width.saturating_sub(prefix.len()) + }; + // The paragraph's source bytes include the `>` continuation + // markers on continuation lines (block quotes only — list-item + // continuation is plain whitespace that `collapse_whitespace` + // already eats). Strip them before sembr. + let bq_depth = block_quote_depth(ancestors); + let cleaned = strip_block_quote_markers(&body[range.clone()], bq_depth); + // Atomic-range protection from the inline AST. The outer AST's + // sourcepos values are in *body* coordinates, which align with + // `cleaned` only when no stripping happened (block-quote depth + // zero). For nested-in-blockquote paragraphs, the cleaner + // approach is to re-parse `cleaned` as a standalone markdown + // fragment and read inline sourcepos from that AST — those + // values are in cleaned coordinates by construction. 
+ let atomic_ranges = if bq_depth == 0 { + collect_inline_atomic_ranges(node, range.start, line_starts, body.len()) + } else { + collect_inline_atomic_ranges_from_text(&cleaned) + }; + let raw = reflow_paragraph(&cleaned, &atomic_ranges, paragraph_max); + let text = if prefix.is_empty() { + raw + } else { + raw.replace('\n', &format!("\n{prefix}")) + }; + out.push(Replacement { range, text }); + // Paragraph's children are inlines, not blocks — no further + // recursion needed. + } + NodeValue::Document + | NodeValue::BlockQuote + | NodeValue::List(_) + | NodeValue::Item(_) + | NodeValue::TaskItem(_) + | NodeValue::Alert(_) + | NodeValue::MultilineBlockQuote(_) + | NodeValue::FootnoteDefinition(_) + | NodeValue::BlockDirective(_) => { + ancestors.push(node); + for child in node.children() { + collect_paragraphs(child, ancestors, out, body, line_starts, max_width); + } + ancestors.pop(); + } + _ => { + // Unsupported container or non-reflowable leaf block. Preserve + // verbatim by not descending; any nested paragraphs inside (e.g. + // inside a FootnoteDefinition or DescriptionList) stay as-is. + } + } +} + +/// Build the continuation-prefix string from the chain of ancestor nodes +/// surrounding a paragraph. +/// Each supported container contributes a fragment; unsupported ancestors +/// contribute nothing. +fn continuation_prefix_from_ancestors(ancestors: &[&AstNode<'_>]) -> String { + let mut prefix = String::new(); + for (i, ancestor) in ancestors.iter().enumerate() { + match &ancestor.data().value { + // Alert (GFM `> [!NOTE]`) shares BlockQuote's per-line `>` + // prefix. MultilineBlockQuote (`>>>`) has its delimiters on + // their own lines and unprefixed content inside, so it falls + // through to the wildcard arm and contributes nothing. + NodeValue::BlockQuote | NodeValue::Alert(_) => prefix.push_str("> "), + NodeValue::Item(node_list) => { + // `padding` is the marker width including the trailing space, + // per comrak's NodeList documentation. 
/// Strip block-quote `>` markers from the continuation lines of a paragraph.
///
/// The first line is copied unchanged.
/// On every later line, up to `depth` markers are removed: before each marker
/// any leading whitespace is dropped, then one `>` plus at most one following
/// space is consumed.
/// Stripping on a line stops at the first position that is not a marker; the
/// whitespace dropped while looking for that marker stays dropped.
///
/// `depth == 0` returns `text` unchanged.
fn strip_block_quote_markers(text: &str, depth: usize) -> String {
    if depth == 0 {
        return text.to_owned();
    }

    let mut lines = text.split('\n');
    let mut out = String::with_capacity(text.len());
    // `split` always yields at least one piece; it is emitted untouched.
    if let Some(first) = lines.next() {
        out.push_str(first);
    }

    for line in lines {
        let mut rest = line;
        for _ in 0..depth {
            rest = rest.trim_start();
            let Some(after_marker) = rest.strip_prefix('>') else {
                break;
            };
            // `> ` and bare `>` are both accepted.
            rest = after_marker.strip_prefix(' ').unwrap_or(after_marker);
        }
        out.push('\n');
        out.push_str(rest);
    }
    out
}
+/// +/// Cost: one extra comrak parse per block-quote-nested paragraph. +/// Block quotes are rare in doc comments and markdown files alike, so this is +/// acceptable. +fn collect_inline_atomic_ranges_from_text(text: &str) -> Vec> { + let arena = Arena::new(); + let options = comrak_options(); + let root = comrak::parse_document(&arena, text, &options); + let line_starts = line_start_offsets(text); + + let mut out = Vec::new(); + walk_paragraphs_for_atomic_ranges(root, &line_starts, text.len(), &mut out); + out +} + +/// Descend the re-parsed AST and collect inline atomic ranges from every +/// paragraph encountered. +/// Mirrors the descend list in `collect_paragraphs` so we don't miss a +/// paragraph nested in a list item or alert inside the stripped block-quote +/// content (e.g. +/// `> - foo. bar.`). +fn walk_paragraphs_for_atomic_ranges<'a>( + node: &'a AstNode<'a>, + line_starts: &[usize], + text_len: usize, + out: &mut Vec>, +) { + let data = node.data(); + if matches!(data.value, NodeValue::Paragraph) { + for child in node.children() { + walk_inline_for_atomic_ranges(child, 0, line_starts, text_len, out); + } + return; + } + for child in node.children() { + walk_paragraphs_for_atomic_ranges(child, line_starts, text_len, out); + } +} + +fn walk_inline_for_atomic_ranges<'a>( + node: &'a AstNode<'a>, + paragraph_start: usize, + line_starts: &[usize], + body_len: usize, + out: &mut Vec>, +) { + let data = node.data(); + let is_atomic = matches!( + data.value, + NodeValue::Emph + | NodeValue::Strong + | NodeValue::Strikethrough + | NodeValue::Code(_) + | NodeValue::Link(_) + | NodeValue::Image(_) + | NodeValue::HtmlInline(_) + | NodeValue::Math(_) + | NodeValue::FootnoteReference(_) + | NodeValue::WikiLink(_) + ); + + if is_atomic { + if let Some(range) = sourcepos_to_byte_range(line_starts, body_len, &data.sourcepos) + && let Some(rel_start) = range.start.checked_sub(paragraph_start) + && let Some(rel_end) = range.end.checked_sub(paragraph_start) + { + 
out.push(rel_start..rel_end); + } + // Outer span covers any nested inlines; no further recursion. + return; + } + + for child in node.children() { + walk_inline_for_atomic_ranges(child, paragraph_start, line_starts, body_len, out); + } +} + +/// Reflow a single paragraph of prose: semantic line breaks (one sentence per +/// line) plus an optional `max_width` safety net. +/// +/// `max_width == 0` disables width wrapping. +/// With width wrapping on, individual tokens that exceed the width are kept +/// intact rather than split mid-token (URLs, paths, identifiers stay whole). +#[must_use] +pub fn reflow_paragraph( + paragraph: &str, + atomic_ranges: &[Range], + max_width: usize, +) -> String { + let sentences = split_sentences(paragraph, atomic_ranges); + if sentences.is_empty() { + return String::new(); + } + + if max_width == 0 { + return sentences.join("\n"); + } + + let wrap_options = textwrap::Options::new(max_width) + .break_words(false) + .word_splitter(WordSplitter::NoHyphenation); + + sentences + .iter() + .map(|s| textwrap::fill(s, &wrap_options)) + .collect::>() + .join("\n") +} + +struct Replacement { + range: Range, + text: String, +} + +/// Convert a comrak [`Sourcepos`] (1-based line, 1-based byte column, +/// end-inclusive) into a half-open byte range. +/// +/// Returns `None` if the coordinates fall outside `body_len` — a defensive +/// guard against any sourcepos drift we haven't observed but shouldn't rely on +/// the absence of. 
/// Byte offset at which each line of `s` begins.
///
/// The first entry is always 0; every `\n` in `s` adds the offset of the byte
/// right after it, so a trailing newline produces a final entry equal to
/// `s.len()`.
fn line_start_offsets(s: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(s.match_indices('\n').map(|(newline_at, _)| newline_at + 1))
        .collect()
}
This is a test.", &[], 0); + assert_eq!(out, "Hello world.\nThis is a test."); +} + +#[test] +fn paragraph_single_sentence_returns_single_line() { + let out = reflow_paragraph("Just one sentence.", &[], 0); + assert_eq!(out, "Just one sentence."); +} + +#[test] +fn paragraph_width_wraps_at_word_boundaries() { + let out = reflow_paragraph("alpha beta gamma delta epsilon zeta.", &[], 12); + for line in out.lines() { + assert!(line.len() <= 12, "line exceeded max_width: {line:?}"); + } + assert!(out.lines().count() > 1); +} + +#[test] +fn paragraph_does_not_break_long_unbreakable_tokens() { + // A URL longer than `max_width` must stay on one line rather than be + // split mid-token. + let url = "https://example.com/path/to/very/long/resource"; + let input = format!("Visit {url} for details."); + let out = reflow_paragraph(&input, &[], 10); + // The URL appears intact on some line. + assert!( + out.lines().any(|l| l.contains(url)), + "URL was broken: {out:?}" + ); +} + +#[test] +fn paragraph_idempotent_under_repeated_reflow() { + let input = "First sentence here. Second sentence too. Third for good measure."; + let once = reflow_paragraph(input, &[], 30); + let twice = reflow_paragraph(&once, &[], 30); + assert_eq!(once, twice); +} + +#[test] +fn paragraph_empty_input_returns_empty() { + assert_eq!(reflow_paragraph("", &[], 0), ""); + assert_eq!(reflow_paragraph(" ", &[], 0), ""); +} + +// --------------------------------------------------------------------------- +// reflow_markdown: comrak-driven block awareness +// --------------------------------------------------------------------------- + +#[test] +fn reference_link_definitions_are_preserved_verbatim() { + let body = indoc! {" + [`format`]: super::format + [`extract`]: super::extract + [`engine`]: super::engine + "}; + assert_eq!(reflow_markdown(body, 0), body); + // Idempotent under width-wrapping too. 
#[test]
fn block_quote_two_sentences_split_to_two_lines() {
    // Two sentences sharing one quoted line end up on separate lines, each
    // carrying its own `> ` marker.
    let quoted = "> First sentence. Second sentence.\n";
    let reflowed = reflow_markdown(quoted, 0);
    assert_eq!(reflowed, "> First sentence.\n> Second sentence.\n");
}
#[test]
fn alert_multi_line_body_strips_continuation_markers() {
    // The body already holds one sentence per `> ` line, so the expected
    // output is the input itself. If the `>` continuation markers leaked
    // into the sentence text, the output would differ.
    let alert = concat!(
        "> [!WARNING]\n",
        "> First sentence here.\n",
        "> Second sentence here.\n",
    );
    assert_eq!(reflow_markdown(alert, 0), alert);
}
#[test]
fn block_directive_reflows_interior_without_per_line_prefix() {
    // A `:::name` directive keeps its opening and closing delimiter lines;
    // the prose between them is split per sentence with no added prefix.
    let directive = concat!(
        ":::warning\n",
        "First sentence. Second sentence.\n",
        ":::\n",
    );
    let want = concat!(
        ":::warning\n",
        "First sentence.\n",
        "Second sentence.\n",
        ":::\n",
    );
    assert_eq!(reflow_markdown(directive, 0), want);
}
+ "}; + let expected = indoc! {" + 100. A very long item with several sentences. + Like this one. + "}; + assert_eq!(reflow_markdown(body, 0), expected); +} + +#[test] +fn task_item_continuation_aligns_with_text_after_checkbox() { + // `- [ ] ` is 6 chars total: 2 for the bullet marker, 4 for `[X] `. + // Continuation lines should land at column 7. + let body = indoc! {" + - [ ] First task. With more detail. + "}; + let expected = indoc! {" + - [ ] First task. + With more detail. + "}; + assert_eq!(reflow_markdown(body, 0), expected); +} + +#[test] +fn checked_task_item_aligns_the_same_as_unchecked() { + let body = indoc! {" + - [x] Done thing. Some explanation. + "}; + let expected = indoc! {" + - [x] Done thing. + Some explanation. + "}; + assert_eq!(reflow_markdown(body, 0), expected); +} + +#[test] +fn task_items_in_a_list_each_get_aligned_continuation() { + let body = indoc! {" + - [ ] First. With more. + - [x] Second. With more. + "}; + let expected = indoc! {" + - [ ] First. + With more. + - [x] Second. + With more. + "}; + assert_eq!(reflow_markdown(body, 0), expected); +} + +#[test] +fn list_item_in_block_quote_uses_compound_prefix() { + let body = indoc! {" + > - First. Second. + "}; + let expected = indoc! {" + > - First. + > Second. + "}; + assert_eq!(reflow_markdown(body, 0), expected); +} + +#[test] +fn gfm_pipe_tables_are_preserved_verbatim() { + // Tables are gated on the `table` extension. Without it comrak parses + // each row as a soft-broken paragraph, which sembr would then split + // mid-row. + let body = indoc! {" + Some prose. + + | head | row | + | ---- | --- | + | a | b | + | c | d | + + More prose. Two sentences. + "}; + let expected = indoc! {" + Some prose. + + | head | row | + | ---- | --- | + | a | b | + | c | d | + + More prose. + Two sentences. + "}; + assert_eq!(reflow_markdown(body, 0), expected); + // Idempotent across width-wrapping too. 
#[test]
fn paragraph_with_backslash_hard_break_is_preserved_verbatim() {
    // A trailing `\` is a GFM hard break. The paragraph must come back
    // untouched even though "Foo. Bar." would normally be split in two.
    let paragraph = concat!("Foo. Bar.\\\n", "Baz.\n");
    let reflowed = reflow_markdown(paragraph, 0);
    assert_eq!(reflowed, paragraph);
}
#[test]
fn atx_headings_are_preserved_verbatim() {
    // Both the heading line and the paragraph below it must still be
    // present after reflowing.
    let reflowed = reflow_markdown("# A Heading\n\nSome prose.\n", 0);
    for fragment in ["# A Heading", "Some prose."] {
        assert!(reflowed.contains(fragment));
    }
}
#[test]
fn trailing_newline_is_preserved() {
    // The formatted source must still end with the newline the input had.
    let formatted = format_source(concat!("/// foo bar baz.\n", "fn f() {}\n"), 0);
    assert!(formatted.ends_with('\n'));
}
+ +pub mod cli; +pub mod extract; +pub mod format; +pub mod run; +pub mod sentence; +pub mod walk; + +#[cfg(test)] +#[path = "lib_tests.rs"] +mod tests; + +use std::{ + ffi::OsString, + path::{Path, PathBuf}, + process::ExitCode, +}; + +use clap::Parser; + +use crate::cli::{Cli, Invocation}; + +/// Default maximum line width for wrapped doc-comment content. +pub const DEFAULT_MAX_WIDTH: usize = 80; + +/// Shared binary entry-point. +/// Both `comfort` and `cargo-comfort` delegate here; the invocation mode is +/// detected from `argv[0]` at runtime. +/// +/// `eprintln!` is otherwise denied by the workspace lints — allowing it here +/// keeps fatal-error reporting in one place. +#[allow(clippy::print_stderr)] +#[must_use] +pub fn cli_main() -> ExitCode { + let raw: Vec = std::env::args_os().collect(); + let (invocation, args) = parse_invocation(raw); + + let cli = Cli::parse_from(args); + + match run::run(&cli, invocation) { + Ok(()) => ExitCode::SUCCESS, + Err(Error::CheckFailed(_)) => ExitCode::from(1), + Err(err) => { + eprintln!("comfort: {err}"); + ExitCode::from(2) + } + } +} + +/// Identify whether we were invoked directly (`comfort`) or by cargo +/// (`cargo-comfort`). +/// For the cargo case, cargo passes the subcommand name (`comfort`) as +/// `args[1]`, which we strip before handing args to clap. +fn parse_invocation(mut raw: Vec) -> (Invocation, Vec) { + let Some(bin) = raw + .first() + .and_then(|p| Path::new(p).file_name().map(OsString::from)) + else { + return (Invocation::Direct, raw); + }; + + // On Windows the binary name carries `.exe`; match either form. + let is_cargo = bin == *"cargo-comfort" || bin == *"cargo-comfort.exe"; + + if !is_cargo { + return (Invocation::Direct, raw); + } + + // Cargo always passes the subcommand name as args[1]. Skip it if present. + if raw.get(1).is_some_and(|s| s == "comfort") { + raw.remove(1); + } + (Invocation::Cargo, raw) +} + +/// Errors produced by the comfort library. 
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// An I/O error that carries no path information.
    /// Converted automatically from [`std::io::Error`] via `#[from]`.
    #[error("io: {0}")]
    Io(#[from] std::io::Error),

    /// An error reported by the `cargo_metadata` crate.
    /// Converted automatically from `cargo_metadata::Error` via `#[from]`.
    #[error("cargo metadata: {0}")]
    CargoMetadata(#[from] cargo_metadata::Error),

    /// An error reported by the `ignore` crate.
    /// Converted automatically from `ignore::Error` via `#[from]`.
    #[error("walk: {0}")]
    Walk(#[from] ignore::Error),

    /// Failed to read a source file.
    /// Carries the path so the user knows which file failed when walking many
    /// at once.
    #[error("failed to read {path}: {source}")]
    ReadFile {
        path: PathBuf,
        #[source]
        source: std::io::Error,
    },

    /// Failed to write a reformatted file back to disk.
    #[error("failed to write {path}: {source}")]
    WriteFile {
        path: PathBuf,
        #[source]
        source: std::io::Error,
    },

    /// One of the names passed to `-p`/`--package` or `--exclude` doesn't match
    /// any workspace package.
    #[error("unknown package: {0}")]
    UnknownPackage(String),

    /// Reported in `--check` mode when at least one file would be reformatted.
    /// Carries the count of files that differ.
    #[error("{0} file(s) would be reformatted")]
    CheckFailed(usize),
}
#[test]
fn inner_module_docs_are_handled() {
    // `//!` blocks are reflowed like `///` blocks: each of the two
    // sentences shows up with its own `//!` prefix.
    let module_src = concat!(
        "//! This module does a thing. It does several things, actually.\n",
        "\n",
        "pub fn f() {}\n",
    );
    let formatted = format_source(module_src, DEFAULT_MAX_WIDTH);
    for sentence in [
        "//! This module does a thing.",
        "//! It does several things, actually.",
    ] {
        assert!(formatted.contains(sentence));
    }
}
#[test]
fn long_urls_are_not_broken_under_tight_max_width() {
    // A width of 20 is narrower than the URL, yet the URL must come out in
    // one piece rather than being split mid-token.
    let url = "https://example.com/path/to/very/long/resource";
    let src = format!("/// See {url} for details.\npub fn f() {{}}\n");
    let out = format_source(&src, 20);
    assert!(out.contains(url), "URL was broken: {out}");
}
#[test]
fn block_quote_reflows_two_sentences_end_to_end() {
    // Through the full pipeline, a quoted doc-comment line with two
    // sentences becomes two `/// > ` lines; the code below is untouched.
    let src = concat!(
        "/// > Two sentences on one line. Like this.\n",
        "pub fn f() {}\n",
    );
    let want = concat!(
        "/// > Two sentences on one line.\n",
        "/// > Like this.\n",
        "pub fn f() {}\n",
    );
    let got = format_source(src, DEFAULT_MAX_WIDTH);
    assert_eq!(got, want);
}
#[test]
fn backslash_hard_break_in_doc_comment_survives_end_to_end() {
    // Address-block-style use of hard breaks: each line is meant to render
    // as a forced `<br>` in rustdoc.
    // The first two doc lines end in a backslash hard break, so the whole
    // source is expected to come back byte-for-byte.
    let src = concat!(
        "/// Example output:\\\n",
        "/// 123 Main St\\\n",
        "/// Springfield\n",
        "pub fn f() {}\n",
    );
    let out = format_source(src, DEFAULT_MAX_WIDTH);
    assert_eq!(out, src);
}
#[test]
fn italic_span_with_period_keeps_emphasis_intact() {
    // The period inside `*Foo.*` must not be taken as a sentence end, so
    // the emphasis span and the text after it stay on one line.
    let reflowed = reflow_markdown("*Foo.* Body sentence here.\n", 80);
    let intact = reflowed.contains("*Foo.* Body sentence here.");
    assert!(intact, "italic span broken: {reflowed}");
}
#[test]
fn snake_case_inside_block_quote_is_not_protected() {
    // An identifier with underscores inside a block quote is not emphasis,
    // so the period after it still ends the sentence and causes a split.
    let quoted = "> See foo_bar_baz. Next sentence.\n";
    let reflowed = reflow_markdown(quoted, 80);
    let split_after_identifier = reflowed.contains("> See foo_bar_baz.\n> Next sentence.");
    assert!(
        split_after_identifier,
        "snake_case got mangled inside blockquote: {reflowed}"
    );
}
+ "}; + let out = reflow_markdown(src, 80); + assert!( + out.contains("> > _Foo._ Body sentence here."), + "emphasis broken inside nested blockquote: {out}" + ); +} + +#[test] +fn emphasis_spanning_two_source_lines_does_not_over_indent_continuation() { + // Regression: a list item containing an italic that crosses a source + // line boundary used to make the continuation line over-indent by + // four spaces instead of two, because the embedded `\n ` from the + // italic span survived into textwrap's view and the container prefix + // step then doubled the indent. + let body = indoc! {" + - Lead in here. *Italics span across + two source lines*, then more body sentence here. + "}; + let out = reflow_markdown(body, 80); + for line in out.lines() { + // Either column 0 (the list-marker line) or exactly two spaces of + // continuation indent. Four spaces would be the bug. + assert!( + !line.starts_with(" "), + "line over-indented (4 spaces): {line:?}" + ); + } + // And the italic span is now folded onto a single logical sentence + // — no `\n ` survives inside. + assert!( + !out.contains("*Italics span across\n"), + "italic span retained its source-level newline: {out}" + ); +} + +#[test] +fn inline_code_spanning_two_source_lines_does_not_over_indent_continuation() { + // Same as above but for inline code spans (reproduction of the + // `tracing::warn!(...)` case from the original report). + let body = indoc! {" + - Emit `tracing::warn!(\"foo bar baz qux quux corge + grault garply\")` for the legacy field on each launch. + "}; + let out = reflow_markdown(body, 80); + for line in out.lines() { + assert!( + !line.starts_with(" "), + "line over-indented (4 spaces): {line:?}" + ); + } + // The inline code span is folded onto a single line — the source-level + // `\n ` inside it does not survive. 
#[test]
fn snake_case_identifier_is_not_treated_as_underscore_italic() {
    // `foo_bar_baz` must not be read as containing an `_bar_` emphasis
    // span: the identifier survives literally and the period after it
    // still triggers a sentence split.
    let reflowed = reflow_markdown("See foo_bar_baz. Next sentence.\n", 80);
    let split_after_identifier = reflowed.contains("See foo_bar_baz.\nNext sentence.");
    assert!(
        split_after_identifier,
        "snake_case got mangled: {reflowed}"
    );
}
+    "};
+    let out = format_markdown_canonical(body, 80);
+    assert!(
+        out.contains("- First item."),
+        "canonical mode didn't normalize bullet to dash: {out}"
+    );
+    assert!(
+        !out.contains("* First item."),
+        "original `*` marker leaked through: {out}"
+    );
+}
+
+#[test]
+fn canonical_mode_aligns_table_columns() {
+    // Misaligned source table; canonical mode should pad data cells to
+    // match the widest cell per column.
+    let body = indoc! {"
+        | A | B |
+        |---|---|
+        | short | very long content |
+        | x | y |
+    "};
+    let out = format_markdown_canonical(body, 80);
+    // Every row's `|` separators should be at consistent column positions.
+    let table_lines: Vec<&str> = out
+        .lines()
+        .filter(|l| l.trim_start().starts_with('|'))
+        .collect();
+    assert!(
+        table_lines.len() >= 4,
+        "expected header + separator + 2 data rows, got {} lines",
+        table_lines.len()
+    );
+    let pipe_positions: Vec<Vec<usize>> = table_lines
+        .iter()
+        .map(|l| {
+            l.char_indices()
+                .filter(|(_, c)| *c == '|')
+                .map(|(i, _)| i)
+                .collect()
+        })
+        .collect();
+    let first = &pipe_positions[0];
+    for (i, positions) in pipe_positions.iter().enumerate() {
+        assert_eq!(
+            positions, first,
+            "row {i} pipe positions don't align with header: {table_lines:#?}"
+        );
+    }
+}
+
+#[test]
+fn canonical_mode_aligns_with_explicit_alignment_markers() {
+    // The separator row's colon pattern carries through after alignment.
+    let body = indoc! {"
+        | left | center | right |
+        | :--- | :---: | ---: |
+        | a | b | c |
+    "};
+    let out = format_markdown_canonical(body, 80);
+    // Left-aligned column keeps leading `:`, right-aligned trailing `:`,
+    // center has both. The dashes get padded to match column width.
+    assert!(
+        out.contains(":---") && out.contains(":----:") && out.contains("----:"),
+        "alignment markers lost or mis-shaped: {out}"
+    );
+}
+
+#[test]
+fn canonical_mode_aligns_table_with_wide_characters() {
+    // Wide characters (CJK) count as 2 cells per `UnicodeWidthStr`.
The + // table should align visually, not by codepoint count. + let body = indoc! {" + | en | jp | + |---|---| + | hi | こんにちは | + | x | y | + "}; + let out = format_markdown_canonical(body, 80); + let table_lines: Vec<&str> = out + .lines() + .filter(|l| l.trim_start().starts_with('|')) + .collect(); + // Pipe positions are computed in BYTE offsets, which won't match for + // multi-byte CJK rows. The correct check is visual: every row, after + // the second `|`, the second cell should be padded to the same display + // width. Approximation: count `|` characters per line — every row + // should have exactly 3 pipes (start, between cols, end). + for line in &table_lines { + let pipe_count = line.chars().filter(|c| *c == '|').count(); + assert_eq!(pipe_count, 3, "row has unexpected pipe count: {line:?}"); + } + // And the CJK row should be padded such that its right edge `|` + // lands at the same DISPLAY column as the other rows. + let display_col_of_last_pipe = |line: &str| -> usize { + let last_pipe_byte = line.rfind('|').unwrap(); + UnicodeWidthStr::width(&line[..last_pipe_byte]) + }; + let first_last = display_col_of_last_pipe(table_lines[0]); + for line in &table_lines[1..] { + assert_eq!( + display_col_of_last_pipe(line), + first_last, + "row's right edge isn't aligned: header={:?} other={:?}", + table_lines[0], + line + ); + } +} + +#[test] +fn canonical_mode_table_alignment_is_idempotent() { + // Aligned table should round-trip unchanged. + let body = indoc! {" + | A | B | + | ----- | ----------------- | + | short | very long content | + | x | y | + "}; + let once = format_markdown_canonical(body, 80); + let twice = format_markdown_canonical(&once, 80); + assert_eq!(once, twice); +} + +#[test] +fn canonical_mode_still_does_sembr_on_paragraphs() { + // After canonicalisation, the sembr pipeline still runs on paragraphs. + let body = indoc! {" + First sentence. Second sentence on the same line. 
+    "};
+    let out = format_markdown_canonical(body, 80);
+    assert!(
+        out.contains("First sentence.\nSecond sentence on the same line."),
+        "sembr didn't run after canonicalisation: {out}"
+    );
+}
+
+#[test]
+fn canonical_mode_is_idempotent_end_to_end() {
+    let body = indoc! {"
+        # Heading
+
+        First sentence here. Second sentence here.
+
+        * Item one. With more.
+        * Item two.
+
+        | A | B |
+        |---|---|
+        | x | y |
+    "};
+    let once = format_markdown_canonical(body, 80);
+    let twice = format_markdown_canonical(&once, 80);
+    assert_eq!(once, twice, "canonical mode must be idempotent");
+}
+
+#[test]
+fn canonical_mode_on_rust_source_normalizes_inside_doc_comments() {
+    // `format_source_canonical` is the Rust-source entry that runs
+    // canonical mode per `///` block.
+    let src = indoc! {"
+        /// First sentence here. Second sentence here.
+        ///
+        /// * Item one.
+        /// * Item two.
+        pub fn f() {}
+    "};
+    let out = format_source_canonical(src, 80);
+    assert!(
+        out.contains("/// - Item one."),
+        "list markers not normalised inside doc comment: {out}"
+    );
+    assert!(
+        out.contains("/// First sentence here.\n/// Second sentence here."),
+        "sembr didn't run inside doc comment: {out}"
+    );
+    // The surrounding code is byte-preserved as always.
+    assert!(out.contains("pub fn f() {}"));
+}
+
+#[test]
+fn canonical_mode_preserves_doc_comment_scaffolding() {
+    // Even with canonical mode on, the `///` prefix and indentation come
+    // straight from the original source.
+    let src = indoc! {"
+        mod m {
+            /// Inner doc. Two sentences.
+            pub fn f() {}
+        }
+    "};
+    let out = format_source_canonical(src, 80);
+    assert!(out.contains("    /// Inner doc."));
+    assert!(out.contains("    /// Two sentences."));
+}
+
+#[test]
+fn canonical_mode_preserves_hard_line_breaks() {
+    // Hard breaks (two trailing spaces or `\\\n`) are semantically distinct
+    // from soft breaks: they render as `<br>` rather than a space. The
+    // canonical pipeline must preserve them through the width=MAX change.
+    // comrak normalises two-trailing-spaces to backslash form, which is
+    // semantically equivalent and arguably more readable in source.
+    let body = "First line.  \nSecond line, hard-broken from first.\n";
+    let out = format_markdown_canonical(body, 80);
+    assert!(
+        out.contains("First line.\\\n") || out.contains("First line.  \n"),
+        "hard break lost: {out:?}"
+    );
+}
+
+#[test]
+fn default_mode_preserves_hard_line_breaks_verbatim() {
+    // Without --format-markdown, hard breaks pass through byte-for-byte
+    // (we don't round-trip through comrak's formatter).
+    let body = "First line.  \nSecond line, hard-broken from first.\n";
+    let out = reflow_markdown(body, 80);
+    assert!(
+        out.contains("First line.  \n"),
+        "two-trailing-spaces hard break form not preserved verbatim: {out:?}"
+    );
+}
+
+#[test]
+fn canonical_mode_does_not_escape_digit_period_in_continuation_lines() {
+    // Regression: with `render.width = 0`, comrak's `format_commonmark`
+    // preserves source soft breaks and defensively escapes `N.` sequences
+    // (e.g. `404\.`) that land at the start of continuation lines, on the
+    // theory that they could be interpreted as ordered-list markers on
+    // re-parse. The fix is `render.width = usize::MAX`, which collapses
+    // soft breaks so digit-period sequences end up mid-line.
+    let body = indoc! {"
+        Each model is loaded at startup; requests for unloaded models return HTTP
+        404.
+        Apply sigmoid normalization next.
+ "}; + let out = format_markdown_canonical(body, 80); + assert!( + out.contains("404."), + "output missing literal `404.`: {out:?}" + ); + assert!( + !out.contains(r"404\."), + "output has defensive escape `404\\.`: {out:?}" + ); +} + +#[test] +fn canonical_mode_preserves_rust_intra_doc_shortcut_references() { + // Regression: `[`format_source`]` and similar shortcut references + // (no `[label]: url` definition in the body) used to be escaped as + // `[`format_source`]` by comrak's defensive escape logic, because + // the parser treated them as plain bracketed text. The fix uses a + // narrow `broken_link_callback` in `protect_reference_form_links` + // that resolves intra-doc-like labels to `Link` nodes, so the + // protection step can sentinelise their source bytes. + let body = indoc! {" + 1. [`format_source`] finds `///` blocks via + [`find_blocks`] and splices bodies back. + 2. [`reflow_markdown`] parses each block's body. + "}; + let out = format_markdown_canonical(body, 80); + for needle in [ + "[`format_source`]", + "[`find_blocks`]", + "[`reflow_markdown`]", + ] { + assert!( + out.contains(needle), + "intra-doc reference {needle:?} missing from output: {out:?}" + ); + } + assert!( + !out.contains(r"\["), + "defensive bracket escape leaked into output: {out:?}" + ); +} + +#[test] +fn intra_doc_callback_does_not_break_task_items() { + // The `broken_link_callback` would gobble `[ ]` task markers if it + // returned `Some` for them. Narrow filter (empty / `x` / `X` labels) + // returns `None`, letting the tasklist extension recognise them. + let body = indoc! {" + - [ ] First task. With more detail. + "}; + let out = reflow_markdown(body, 0); + assert!( + out.contains("- [ ] First task."), + "task marker lost: {out:?}" + ); + // Continuation indent should be 6 spaces (2 for list padding + 4 for + // task item), confirming the parser still recognised the task item. 
+    assert!(
+        out.contains("\n      With more detail."),
+        "task item continuation indent wrong: {out:?}"
+    );
+}
+
+#[test]
+fn intra_doc_callback_does_not_break_footnotes() {
+    // The `broken_link_callback` would gobble `[^note]` references if it
+    // returned `Some` for them. Narrow filter (`^...` labels) returns
+    // `None`, letting the footnotes extension recognise them.
+    let body = indoc! {"
+        See[^note] for details.
+
+        [^note]: First sentence. Second sentence.
+    "};
+    let out = format_markdown_canonical(body, 0);
+    // The reference in prose stays as `[^note]` — not the defensive
+    // `\[^note\]` we'd see if the parser failed to recognise it as a
+    // footnote reference.
+    assert!(
+        out.contains("See[^note] for details."),
+        "footnote reference got escaped: {out:?}"
+    );
+    assert!(
+        !out.contains(r"\[^note\]"),
+        "defensive escape leaked into footnote reference: {out:?}"
+    );
+    // The definition survives the canonical pass (comrak may reshape it
+    // — e.g. put the label on its own line — but the content stays).
+    assert!(
+        out.contains("[^note]:"),
+        "footnote definition disappeared: {out:?}"
+    );
+    assert!(
+        out.contains("First sentence.") && out.contains("Second sentence."),
+        "footnote definition content lost: {out:?}"
+    );
+}
+
+#[test]
+fn markdown_pipeline_preserves_exact_trailing_newline_count() {
+    // Regression: conform.nvim's `injected` formatter extracts the markdown
+    // body of Rust doc comments and runs comfort as the markdown formatter
+    // on it. The body ending in `\n\n` corresponds to a trailing empty
+    // `///` line in the source. If we collapse `\n\n` to `\n`, the empty
+    // `///` is silently lost on every save.
+ let body_two_newlines = "Some prose.\n\n[link]: https://example.com\n\n"; + let out = format_markdown_canonical(body_two_newlines, 80); + assert!( + out.ends_with("\n\n"), + "trailing newline count not preserved: {out:?}" + ); + + let body_three_newlines = "Some prose.\n\n\n"; + let out = format_markdown_canonical(body_three_newlines, 80); + assert!( + out.ends_with("\n\n\n"), + "trailing newline count not preserved: {out:?}" + ); +} + +#[test] +fn canonical_mode_preserves_trailing_newline_of_input() { + // Markdown files: keep trailing newline. Doc-comment bodies: don't add + // one. + let with_newline = "Some prose.\n"; + let out = format_markdown_canonical(with_newline, 80); + assert!(out.ends_with('\n'), "trailing newline dropped: {out:?}"); + + let without_newline = "Some prose."; + let out = format_markdown_canonical(without_newline, 80); + assert!(!out.ends_with('\n'), "trailing newline added: {out:?}"); +} + +// --------------------------------------------------------------------------- +// Reference-form link protection across canonical (`--format-markdown`) +// --------------------------------------------------------------------------- + +#[test] +fn canonical_preserves_full_form_reference_link() { + // Regression: comrak's `format_commonmark` would otherwise inline + // `[text][label]` as `[text](url)` and drop the definition. + let body = indoc! {" + See [`foo`][foo-impl] for more. + + [foo-impl]: ../../crates/foo.rs + "}; + let out = format_markdown_canonical(body, 80); + assert!( + out.contains("[`foo`][foo-impl]"), + "full-form reference link was inlined: {out}" + ); + assert!( + out.contains("[foo-impl]: ../../crates/foo.rs"), + "reference definition was dropped: {out}" + ); +} + +#[test] +fn canonical_preserves_shortcut_form_reference_link() { + let body = indoc! {" + See [foo] for more. 
+ + [foo]: https://example.com + "}; + let out = format_markdown_canonical(body, 80); + assert!( + out.contains("See [foo] for more."), + "shortcut form not preserved: {out}" + ); + assert!( + out.contains("[foo]: https://example.com"), + "definition dropped: {out}" + ); +} + +#[test] +fn canonical_still_inlines_actual_inline_links() { + // Sanity: inline links are NOT protected (the user wrote them inline, + // they stay inline). This is a guard against the protection logic + // accidentally over-firing. + let body = indoc! {" + See [docs](https://example.com) for more. + "}; + let out = format_markdown_canonical(body, 80); + assert!( + out.contains("[docs](https://example.com)"), + "inline link got converted to reference: {out}" + ); +} + +#[test] +fn canonical_handles_mixed_inline_and_reference_links() { + let body = indoc! {" + See [docs](https://example.com) and [`foo`][foo-impl] for more. + + [foo-impl]: ../../crates/foo.rs + "}; + let out = format_markdown_canonical(body, 80); + assert!( + out.contains("[docs](https://example.com)"), + "inline link mangled: {out}" + ); + assert!( + out.contains("[`foo`][foo-impl]"), + "reference-form link inlined: {out}" + ); + assert!( + out.contains("[foo-impl]: ../../crates/foo.rs"), + "definition dropped: {out}" + ); +} + +#[test] +fn canonical_preserves_user_chosen_labels_with_reference_links_flag() { + // The original bug: with BOTH `--format-markdown` and `--reference-links`, + // the user's chosen short labels were destroyed. Verify they survive now. + let body = indoc! {" + See [`verify_file_checksum`][verify-impl] for the impl. 
+ + [verify-impl]: ../../crates/jp_mcp/src/client.rs + "}; + let opts = FormatOptions { + max_width: 80, + canonical: true, + reference_links: true, + }; + let out = format_markdown_with(body, &opts); + assert!( + out.contains("[verify-impl]: ../../crates/jp_mcp/src/client.rs"), + "user's `verify-impl` label was rewritten: {out}" + ); + assert!( + out.contains("[`verify_file_checksum`][verify-impl]"), + "reference form not preserved: {out}" + ); +} + +#[test] +fn canonical_protection_only_affects_resolved_reference_links() { + // Bare `[brackets]` with no matching definition aren't reference-form + // links — comrak doesn't parse them as Link nodes — so our protection + // doesn't touch them. Comrak itself escapes the brackets during + // serialisation to disambiguate (a behaviour of `format_commonmark`, + // not our protection), so the output has `\[...\]`. The point of this + // test is the negative: our protection didn't spuriously stash these. + let body = indoc! {" + Use [square brackets] in prose freely. + "}; + let out = format_markdown_canonical(body, 80); + // No sentinel marker leaked into the output (would start with `XCMFRTLR`). + assert!( + !out.contains("XCMFRTLR"), + "sentinel leaked into output: {out}" + ); + // The visible text "square brackets" survives in some form. + assert!( + out.contains("square brackets"), + "prose content disappeared: {out}" + ); +} + +#[test] +fn canonical_protection_ignores_definitions_inside_code_fences() { + let body = indoc! {" + Real link: [foo]. + + ``` + [example]: not-a-real-def + ``` + + [foo]: https://example.com + "}; + let out = format_markdown_canonical(body, 80); + // The fake def inside the fence stays in the fence. + let fence_close = out.rfind("```").unwrap(); + let example_pos = out.find("[example]: not-a-real-def").unwrap(); + assert!( + example_pos < fence_close, + "fake def was extracted out of the fence: {out}" + ); + // The real def survives. 
+ assert!( + out.contains("[foo]: https://example.com"), + "real definition dropped: {out}" + ); +} + +// --------------------------------------------------------------------------- +// `--reference-links` (reference-link extraction) mode +// --------------------------------------------------------------------------- + +fn ref_opts(max_width: usize) -> FormatOptions { + FormatOptions { + max_width, + canonical: false, + reference_links: true, + } +} + +#[test] +fn reference_links_default_off_preserves_inline_links() { + // Without `--reference-links`, inline links pass through unchanged. + let body = indoc! {" + See [docs](https://example.com) for more. + "}; + let out = reflow_markdown(body, 80); + assert!( + out.contains("[docs](https://example.com)"), + "default mode rewrote the inline link: {out}" + ); +} + +#[test] +fn reference_links_converts_inline_to_shortcut_form() { + let body = indoc! {" + See [docs](https://example.com) for more. + "}; + let out = format_markdown_with(body, &ref_opts(80)); + assert!( + out.contains("See [docs] for more."), + "inline link not converted to shortcut form: {out}" + ); + assert!( + out.contains("[docs]: https://example.com"), + "reference definition not appended: {out}" + ); +} + +#[test] +fn reference_links_dedupes_same_url() { + // Same URL referenced twice with different text: second link uses full + // form referring back to the first's canonical label — only one + // definition is emitted. + let body = indoc! {" + See [docs](https://example.com) and [more docs](https://example.com). 
+ "}; + let out = format_markdown_with(body, &ref_opts(80)); + assert!( + out.contains("[docs]") && out.contains("[more docs][docs]"), + "same-URL collision not handled with full-form fallback: {out}" + ); + assert_eq!( + out.matches("[docs]: https://example.com").count(), + 1, + "shared URL got more than one definition: {out}" + ); +} + +#[test] +fn reference_links_disambiguates_same_text_different_url() { + // Same text, different URLs: second link gets a suffixed label. + let body = indoc! {" + See [docs](https://example.com) and [docs](https://other.com). + "}; + let out = format_markdown_with(body, &ref_opts(80)); + assert!( + out.contains("[docs]: https://example.com"), + "first definition missing: {out}" + ); + assert!( + out.contains("[docs-2]: https://other.com"), + "disambiguated definition missing: {out}" + ); + assert!( + out.contains("[docs][docs-2]"), + "second link not in full form: {out}" + ); +} + +#[test] +fn reference_links_skips_anchor_links() { + let body = indoc! {" + See [section](#foo) for more. + "}; + let out = format_markdown_with(body, &ref_opts(80)); + assert!( + out.contains("[section](#foo)"), + "anchor link should not be converted: {out}" + ); +} + +#[test] +fn reference_links_skips_image_links() { + let body = indoc! {" + See ![diagram](https://example.com/d.png) below. + "}; + let out = format_markdown_with(body, &ref_opts(80)); + assert!( + out.contains("![diagram](https://example.com/d.png)"), + "image link should not be converted: {out}" + ); +} + +#[test] +fn reference_links_aggregates_pre_existing_definitions() { + // Pre-existing scattered definitions should also move to the bottom + // and sort alphabetically with the newly converted ones. + let body = indoc! {" + See [zebra] and [alpha](https://alpha.example). + + [zebra]: https://zebra.example + "}; + let out = format_markdown_with(body, &ref_opts(80)); + // Both definitions should be at the bottom, in alphabetical order. 
+ let alpha_pos = out.find("[alpha]: https://alpha.example").unwrap(); + let zebra_pos = out.find("[zebra]: https://zebra.example").unwrap(); + assert!( + alpha_pos < zebra_pos, + "definitions not sorted alphabetically: {out}" + ); +} + +#[test] +fn reference_links_preserves_inline_code_with_link_syntax() { + // Inline code containing `[link](url)` syntax must NOT be converted. + let body = indoc! {" + Use the syntax `[text](url)` to write links. + "}; + let out = format_markdown_with(body, &ref_opts(80)); + assert!( + out.contains("`[text](url)`"), + "inline code with link syntax got mangled: {out}" + ); + assert!( + !out.contains("[text]: url"), + "link inside inline code spuriously generated a definition: {out}" + ); +} + +#[test] +fn reference_links_is_idempotent() { + let body = indoc! {" + See [docs](https://example.com) and [Rust](https://rust-lang.org). + "}; + let once = format_markdown_with(body, &ref_opts(80)); + let twice = format_markdown_with(&once, &ref_opts(80)); + assert_eq!(once, twice, "reference-link mode must be idempotent"); +} + +#[test] +fn reference_links_works_with_rust_doc_comments() { + // The original motivating example from the user. + let src = indoc! {" + /// Source language to format. + /// With [`Auto`](Language::Auto), per-file detection (extension or + /// `--stdin-filename`) determines the format. + pub fn f() {} + "}; + let out = format_rust_source_with(src, &ref_opts(80)); + assert!( + out.contains("/// With [`Auto`],"), + "link not converted in doc comment: {out}" + ); + assert!( + out.contains("/// [`Auto`]: Language::Auto"), + "reference definition not at bottom of doc comment: {out}" + ); + assert!(out.contains("pub fn f() {}")); +} + +#[test] +fn reference_links_composes_with_canonical_mode() { + // Both flags enabled: canonical pass runs first (normalising structure), + // then reference-link extraction. Both transformations should apply. + let body = indoc! {" + * See [docs](https://example.com). 
+ * Another [item](https://other.com). + "}; + let opts = FormatOptions { + max_width: 80, + canonical: true, + reference_links: true, + }; + let out = format_markdown_with(body, &opts); + // Canonical: `*` → `-`. + assert!( + out.contains("- See [docs]"), + "canonical pass didn't normalise list marker: {out}" + ); + // Reference: definitions at the bottom. + assert!( + out.contains("[docs]: https://example.com") && out.contains("[item]: https://other.com"), + "reference-link pass didn't run: {out}" + ); +} + +#[test] +fn reference_links_preserves_inline_link_title() { + // Regression: `[docs](url "Title")` used to round-trip as + // `[docs] + [docs]: url`, silently dropping the title metadata. + let body = indoc! {r#" + See [docs](https://example.com "Docs Title") for more. + "#}; + let out = format_markdown_with(body, &ref_opts(80)); + assert!( + out.contains("See [docs] for more."), + "inline link not converted to shortcut form: {out}" + ); + assert!( + out.contains(r#"[docs]: https://example.com "Docs Title""#), + "reference definition lost its title: {out}" + ); +} + +#[test] +fn reference_links_disambiguates_same_url_with_different_titles() { + // Two links pointing at the same URL but carrying different titles + // must get distinct definitions — otherwise the title of one is + // silently dropped during dedup. + let body = indoc! {r#" + See [primary](https://example.com "Primary view") and + [alternate](https://example.com "Alternate view"). + "#}; + let out = format_markdown_with(body, &ref_opts(80)); + assert!( + out.contains(r#"[primary]: https://example.com "Primary view""#), + "first definition missing or titleless: {out}" + ); + assert!( + out.contains(r#"[alternate]: https://example.com "Alternate view""#), + "second definition missing or titleless: {out}" + ); + // Both link sites should use shortcut form (each label was free). 
+ assert!( + out.contains("[primary]") && out.contains("[alternate]"), + "link sites didn't pick up their reference forms: {out}" + ); +} + +#[test] +fn reference_links_dedupes_same_url_same_title() { + // Same URL AND same title: a single definition, both link sites point + // at the same canonical label (full form for the second to preserve + // its different link text). + let body = indoc! {r#" + See [docs](https://example.com "Docs") and + [more docs](https://example.com "Docs"). + "#}; + let out = format_markdown_with(body, &ref_opts(80)); + assert_eq!( + out.matches(r#"[docs]: https://example.com "Docs""#).count(), + 1, + "shared (url, title) got more than one definition: {out}" + ); + assert!( + out.contains("[more docs][docs]"), + "second link not in full-form referring back to the first: {out}" + ); +} + +#[test] +fn reference_links_preserves_existing_definition_with_title() { + // A pre-existing scattered `[foo]: url "title"` definition must come + // out the other end with its title intact (and moved to the bottom). + let body = indoc! {r#" + See [foo] for more. + + [foo]: https://example.com "Foo title" + "#}; + let out = format_markdown_with(body, &ref_opts(80)); + assert!( + out.contains(r#"[foo]: https://example.com "Foo title""#), + "existing definition lost its title: {out}" + ); +} + +#[test] +fn reference_links_with_titles_is_idempotent() { + let body = indoc! {r#" + See [docs](https://example.com "D") and [other](https://other.com "O"). + "#}; + let once = format_markdown_with(body, &ref_opts(80)); + let twice = format_markdown_with(&once, &ref_opts(80)); + assert_eq!( + once, twice, + "reference-link mode with titles must be idempotent" + ); +} + +#[test] +fn reference_links_handles_case_insensitive_label_collisions() { + // Regression: CommonMark reference labels match case-insensitively + // (§4.7). 
An existing `[Foo]: /old` must collide with an inline + // `[foo](/new)` even though the raw strings differ in case — + // otherwise we'd emit two definitions with the same canonical label + // and the renderer would resolve the converted shortcut to whichever + // came first. + let body = indoc! {" + See [Foo] and [foo](/new). + + [Foo]: /old + "}; + let out = format_markdown_with(body, &ref_opts(80)); + assert!( + out.contains("[Foo]: /old"), + "existing definition lost: {out}" + ); + assert!( + out.contains("[foo-2]: /new"), + "disambiguated definition for new URL missing: {out}" + ); + assert!( + out.contains("[foo][foo-2]"), + "new link doesn't reference the disambiguated label: {out}" + ); +} + +#[test] +fn reference_links_handles_whitespace_normalized_label_collisions() { + // CommonMark §4.7 normalises internal whitespace too: `[foo bar]` and + // `[Foo Bar]` are the same label. + let body = indoc! {" + See [Foo Bar] and [foo bar](/new). + + [Foo Bar]: /old + "}; + let out = format_markdown_with(body, &ref_opts(80)); + // The new link's URL is different, so it must get a disambiguated + // label even though `foo bar` looks free to a raw-string lookup. + assert!( + out.contains("[foo bar-2]: /new"), + "whitespace-collision not disambiguated: {out}" + ); +} + +#[test] +fn reference_links_does_not_extract_def_that_interrupts_paragraph() { + // Regression: CommonMark forbids reference definitions from + // interrupting a paragraph. `Foo\n[bar]: /baz` is one paragraph, and + // the `[bar]: /baz` line is visible prose — not a definition. The + // line-shape extractor used to take it out anyway and re-emit it + // below, silently changing rendered content. 
+ let body = "Foo\n[bar]: /baz\n"; + let out = format_markdown_with(body, &ref_opts(80)); + assert!( + !out.contains("\n\n[bar]: /baz"), + "in-paragraph ref-def shape was extracted to a separate block: {out:?}" + ); + assert!( + out.contains("[bar]: /baz"), + "the [bar]: /baz text disappeared from the output: {out:?}" + ); +} + +#[test] +fn reference_links_still_extracts_legitimately_separated_definitions() { + // Canary for the fix above: a definition that's NOT inside a paragraph + // (separated by a blank line) must still be extracted and consolidated + // at the bottom. The paragraph-protection rule has to be specific + // enough not to swallow this case. + let body = indoc! {" + Some prose. + + [foo]: /bar + "}; + let out = format_markdown_with(body, &ref_opts(80)); + assert!( + out.contains("[foo]: /bar"), + "legitimate ref-def lost: {out}" + ); +} + +#[test] +fn reference_links_skips_definitions_inside_fenced_code() { + // A `[label]: url` line inside a fenced code block must NOT be treated + // as a reference definition (it's literal example text). + let body = indoc! {" + Real link: [docs](https://example.com). + + ``` + [example]: https://not-a-real-def.com + ``` + "}; + let out = format_markdown_with(body, &ref_opts(80)); + // The fake def inside the fence should stay where it is. + assert!( + out.contains("[example]: https://not-a-real-def.com"), + "fake definition inside fence got extracted: {out}" + ); + // It should appear inside the fence, not at the bottom. + let example_pos = out.find("[example]: https://not-a-real-def.com").unwrap(); + let fence_close = out.rfind("```").unwrap(); + assert!( + example_pos < fence_close, + "fake definition extracted out of fence: {out}" + ); +} + +#[test] +fn no_doc_comments_means_byte_identical_output() { + let src = indoc! 
{" + fn main() { + // ordinary comment + let x = 42; + println!(\"{x}\"); + } + "}; + assert_eq!(format_source(src, DEFAULT_MAX_WIDTH), src); +} + +#[test] +fn read_file_error_carries_the_path() { + let err = Error::ReadFile { + path: PathBuf::from("/tmp/nope.rs"), + source: io::Error::new(io::ErrorKind::PermissionDenied, "denied"), + }; + let msg = err.to_string(); + assert!(msg.contains("/tmp/nope.rs"), "missing path: {msg}"); + assert!(msg.contains("denied"), "missing source: {msg}"); +} + +#[test] +fn write_file_error_carries_the_path() { + let err = Error::WriteFile { + path: PathBuf::from("/tmp/nope.rs"), + source: io::Error::new(io::ErrorKind::PermissionDenied, "denied"), + }; + let msg = err.to_string(); + assert!(msg.contains("/tmp/nope.rs"), "missing path: {msg}"); + assert!(msg.contains("denied"), "missing source: {msg}"); +} diff --git a/crates/contrib/comfort/src/run.rs b/crates/contrib/comfort/src/run.rs new file mode 100644 index 00000000..1950b390 --- /dev/null +++ b/crates/contrib/comfort/src/run.rs @@ -0,0 +1,188 @@ +//! Orchestration layer: parses CLI intent, walks filesystem, dispatches to the +//! pure format pipeline, handles `--check` diffing and exit codes. +//! +//! This is the imperative shell. +//! The functional core lives in [`format`] and [`extract`]. +//! +//! [`extract`]: super::extract +//! [`format`]: super::format + +use std::{ + io::{self, IsTerminal, Read, Write}, + path::{Path, PathBuf}, +}; + +use similar::{ChangeTag, TextDiff}; + +use crate::{ + Error, + cli::{Cli, Format, Invocation, Language}, + format::{FormatOptions, format_markdown_with, format_rust_source_with}, + walk::{expand_path, workspace_files}, +}; + +/// Top-level entry point. +/// Returns an [`Error`] for I/O failures; returns [`Error::CheckFailed`] when +/// `--check` finds drift. +pub fn run(cli: &Cli, invocation: Invocation) -> Result<(), Error> { + // Source selection. The intent ladder: + // 1. 
Workspace mode (explicit `--workspace`, or `-p`/`--exclude` + // restricting which packages to walk). + // 2. Explicit paths process those paths. + // 3. No paths + cargo invocation: workspace (all packages). + // 4. No paths + direct invocation: stdin/stdout. + let opts = FormatOptions { + max_width: cli.max_width, + canonical: cli.format_markdown, + reference_links: cli.reference_links, + }; + let workspace_mode = cli.workspace || !cli.packages.is_empty() || !cli.exclude.is_empty(); + if workspace_mode { + let files = workspace_files(&cli.packages, &cli.exclude, cli.language)?; + return run_files(files, cli.language, cli.check, cli.list_changed, &opts); + } + if !cli.paths.is_empty() { + let mut files = Vec::new(); + for path in &cli.paths { + files.extend(expand_path(path, cli.language)?); + } + return run_files(files, cli.language, cli.check, cli.list_changed, &opts); + } + if invocation == Invocation::Cargo { + let files = workspace_files(&[], &[], cli.language)?; + return run_files(files, cli.language, cli.check, cli.list_changed, &opts); + } + + // Default for direct invocation: stdin → stdout (or stdin → check-diff). + if io::stdin().is_terminal() { + let mut stderr = io::stderr().lock(); + writeln!( + stderr, + "comfort: no input. Pass paths, use --workspace, or pipe source on stdin." 
+ )?; + return Ok(()); + } + run_stdin( + cli.language, + cli.check, + cli.list_changed, + cli.stdin_filename.as_deref(), + &opts, + ) +} + +fn run_stdin( + language: Language, + check: bool, + list_changed: bool, + stdin_filename: Option<&Path>, + opts: &FormatOptions, +) -> Result<(), Error> { + let mut buf = String::new(); + io::stdin().read_to_string(&mut buf)?; + + let format = language.resolve(stdin_filename); + let formatted = format_for(&buf, format, opts); + + let label = stdin_filename.unwrap_or(Path::new("")); + + if check { + if formatted != buf { + if list_changed { + writeln!(io::stdout().lock(), "{}", label.display())?; + } else { + print_diff(label, &buf, &formatted)?; + } + return Err(Error::CheckFailed(1)); + } + return Ok(()); + } + + // Write mode + `--list-changed`: announce the label on stderr so it + // doesn't corrupt the formatted-content stream on stdout. (In check + // mode there's no payload on stdout, so the label goes there to match + // the file-walk path.) 
+ if list_changed && formatted != buf { + writeln!(io::stderr().lock(), "{}", label.display())?; + } + + let mut stdout = io::stdout().lock(); + stdout.write_all(formatted.as_bytes())?; + Ok(()) +} + +fn run_files( + files: Vec, + language: Language, + check: bool, + list_changed: bool, + opts: &FormatOptions, +) -> Result<(), Error> { + let mut changed = 0_usize; + let mut stdout = io::stdout().lock(); + + for path in files { + let source = std::fs::read_to_string(&path).map_err(|source| Error::ReadFile { + path: path.clone(), + source, + })?; + let format = language.resolve(Some(&path)); + let formatted = format_for(&source, format, opts); + if formatted == source { + continue; + } + + changed += 1; + if list_changed { + writeln!(stdout, "{}", path.display())?; + } else if check { + print_diff(&path, &source, &formatted)?; + } + if !check { + std::fs::write(&path, formatted).map_err(|source| Error::WriteFile { + path: path.clone(), + source, + })?; + } + } + + if check && changed > 0 { + return Err(Error::CheckFailed(changed)); + } + Ok(()) +} + +/// Dispatch to the right pipeline for the resolved format. +/// Both optional transformations (`--format-markdown` for structural +/// canonicalisation, `--reference-links` for link extraction) compose +/// orthogonally on top of the always-on sembr reflow. 
+fn format_for(source: &str, format: Format, opts: &FormatOptions) -> String { + match format { + Format::Rust => format_rust_source_with(source, opts), + Format::Markdown => format_markdown_with(source, opts), + } +} + +fn print_diff(label: &Path, old: &str, new: &str) -> Result<(), io::Error> { + let diff = TextDiff::from_lines(old, new); + let mut out = io::stdout().lock(); + + writeln!(out, "--- {}", label.display())?; + writeln!(out, "+++ {} (formatted)", label.display())?; + + for hunk in diff.unified_diff().iter_hunks() { + writeln!(out, "{}", hunk.header())?; + for change in hunk.iter_changes() { + let sigil = match change.tag() { + ChangeTag::Delete => '-', + ChangeTag::Insert => '+', + ChangeTag::Equal => ' ', + }; + write!(out, "{sigil}{}", change.value())?; + if !change.value().ends_with('\n') { + writeln!(out)?; + } + } + } + Ok(()) +} diff --git a/crates/contrib/comfort/src/sentence.rs b/crates/contrib/comfort/src/sentence.rs new file mode 100644 index 00000000..464701ab --- /dev/null +++ b/crates/contrib/comfort/src/sentence.rs @@ -0,0 +1,302 @@ +//! Sentence segmentation with abbreviation-aware merging. +//! +//! Adapted from snapper-fmt (), +//! MIT-licensed, Copyright (c) 2026 Rohit Goswami. +//! +//! Reduced to the English-only subset comfort actually needs and inlined to +//! avoid the upstream dependency. +//! Logic is otherwise unchanged: protect inline tokens (URLs, code spans, +//! links) with placeholders, run UAX \#29 sentence segmentation, then merge +//! false splits caused by abbreviations and quoted punctuation. + +use std::{ops::Range, sync::LazyLock}; + +use regex::Regex; +use unicode_segmentation::UnicodeSegmentation; + +/// English abbreviations whose trailing period must not be treated as a +/// sentence boundary. +/// Kept short and code-comment-focused. 
+static EN_ABBREVIATIONS: &[&str] = &[ + // Titles + "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr", "St", "Rev", "Gen", "Gov", "Sgt", "Cpl", "Pvt", + "Capt", "Lt", "Col", "Maj", "Cmdr", "Adm", // Academic / scientific + "Fig", "Figs", "Eq", "Eqs", "Ref", "Refs", "Tab", "Sec", "Ch", "Vol", "No", "Nos", "Ed", "Eds", + "Trans", "Dept", "Thm", "Lem", "Prop", "Def", "Cor", "Rem", "Ex", // Latin + "al", "approx", "ca", "cf", "etc", "et", "ibid", "viz", // Common + "vs", "misc", "est", "govt", "dept", "univ", "inc", "corp", "ltd", "Ave", "Blvd", "Rd", "Jan", + "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Mon", "Tue", "Wed", + "Thu", "Fri", "Sat", "Sun", "pp", "pg", "pt", "pts", // Single letters (initials) + "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", + "T", "U", "V", "W", "X", "Y", "Z", +]; + +/// Multi-word abbreviations where the period falls inside, e.g. `e.g.`, `i.e.`, +/// `a.m.`, `p.m.`, `v.s.`. +static EN_MULTI_ABBREVS: &[&str] = &["e.g", "i.e", "a.m", "p.m", "v.s"]; + +/// Inline tokens that must not be broken across sentences. +/// Replaced with placeholders before segmentation, restored after. +static INLINE_TOKEN_RE: LazyLock = LazyLock::new(|| { + Regex::new( + &[ + r"\[\[[^\]]*\]\]", // Org links: [[url]] + r"\[\[[^\]]*\]\[[^\]]*\]\]", // Org links with description + r"\[[^\]]+\]\([^)]+\)", // Markdown inline links + r"!\[[^\]]*\]\([^)]+\)", // Markdown images + r"\$[^$]+\$", // Inline math + r"\\([a-zA-Z]+)\{[^}]*\}", // LaTeX commands + r"~[^~]+~", // Org inline code + r"=[^=]+=", // Org verbatim + r"`[^`]+`", // Markdown inline code + r"\*\*[^*]+\*\*", // Markdown bold: **text** + r"~~[^~]+~~", // Markdown strikethrough: ~~text~~ + r#"https?://\S+[^.\s!?,;:)\]'""]"#, // URLs (don't swallow trailing punct) + r"file:\S+", // file:// links + ] + .join("|"), + ) + .expect("valid inline-token regex") +}); + +/// Punctuation followed by closing quote/paren at the end of a segment. 
+/// Used to detect false splits like `He said "wow!"` + `and left.`. +static QUOTED_PUNCT_END_RE: LazyLock = + LazyLock::new(|| Regex::new(r#"[.!?]["')\]]+\s*$"#).expect("valid quoted-punct regex")); + +/// Compiled regex matching a single-token abbreviation immediately before a +/// trailing period. +/// Anchored to end of segment. +static ABBREV_RE: LazyLock = LazyLock::new(|| { + let alts = EN_ABBREVIATIONS.join("|"); + let pattern = format!(r#"(?:^|[\s"'`(\[])(?:{alts})$"#); + Regex::new(&pattern).expect("valid abbreviation regex") +}); + +/// Compiled regex matching a multi-word abbreviation immediately before a +/// trailing period. +static MULTI_ABBREV_RE: LazyLock = LazyLock::new(|| { + let alts: Vec = EN_MULTI_ABBREVS.iter().map(|a| regex::escape(a)).collect(); + let pattern = format!(r"(?:^|\s)(?:{})$", alts.join("|")); + Regex::new(&pattern).expect("valid multi-abbreviation regex") +}); + +/// Split a prose paragraph into individual sentences, respecting common +/// abbreviations and inline-token boundaries. +/// +/// `atomic_ranges` are byte ranges in `text` that must be treated as +/// indivisible by sentence segmentation: typically markdown inline spans +/// (`Emph`, `Strong`, `Strikethrough`, `Code`, `Link`, etc.) whose byte extents +/// come from the AST walker in [`format`]. +/// Pass `&[]` for the standalone path; in that case only `INLINE_TOKEN_RE` +/// regex protection applies. +/// +/// Ranges that overlap with earlier ones (or with regex matches in the same +/// position) are dropped; the first match wins. +/// +/// [`format`]: crate::format +#[must_use] +pub fn split_sentences(text: &str, atomic_ranges: &[Range]) -> Vec { + // Trim and adjust caller-provided ranges to the trimmed slice. Atomic + // ranges typically arrive aligned to `text` exactly (the AST walker + // computes them from sourcepos relative to the paragraph's start), + // but the trim is defensive. 
+ let leading = text.len() - text.trim_start().len(); + let trimmed = text.trim(); + if trimmed.is_empty() { + return Vec::new(); + } + + // Gather every protected span: caller-provided atomic ranges first, + // then regex matches for the patterns we can't reliably get from the + // AST (bare URLs, file:// links, org-mode tokens, etc.). Dropped if + // out-of-bounds or not at char boundaries. + let mut protected: Vec> = Vec::new(); + for r in atomic_ranges { + let Some(start) = r.start.checked_sub(leading) else { + continue; + }; + let Some(end) = r.end.checked_sub(leading) else { + continue; + }; + if start < end + && end <= trimmed.len() + && trimmed.is_char_boundary(start) + && trimmed.is_char_boundary(end) + { + protected.push(start..end); + } + } + for m in INLINE_TOKEN_RE.find_iter(trimmed) { + protected.push(m.start()..m.end()); + } + // Sort by start; drop ranges that overlap with an earlier one + // (earlier always wins). + protected.sort_by_key(|r| r.start); + let mut non_overlapping: Vec> = Vec::new(); + let mut max_end = 0; + for r in protected { + if r.start >= max_end { + max_end = r.end; + non_overlapping.push(r); + } + } + + // Substitute placeholders in a single forward pass. Placeholders use + // NUL to avoid colliding with any normal text content. + // + // The atomic content goes through `fold_line_breaks` first: a span + // whose source crosses a line boundary (e.g. an italic that wraps + // across two markdown lines with a continuation indent) would + // otherwise leak the embedded `\n ` into the placeholder. textwrap + // treats `\n` as a forced break, and the downstream container + // prefix step would then add its own continuation indent on top of + // the preserved source indent — producing visibly over-indented + // output. Folding line breaks to a single space matches CommonMark's + // rendering rule for inline spans. 
+ let mut placeholders: Vec = Vec::new(); + let mut substituted = String::with_capacity(trimmed.len()); + let mut cursor = 0; + for r in &non_overlapping { + substituted.push_str(&trimmed[cursor..r.start]); + let original = fold_line_breaks(&trimmed[r.clone()]); + let idx = placeholders.len(); + substituted.push_str(&format!("\x00PH{idx}\x00")); + placeholders.push(original); + cursor = r.end; + } + substituted.push_str(&trimmed[cursor..]); + + // Collapse runs of whitespace (newlines, tabs, multiple spaces) into a + // single space. Markdown renders soft line breaks as spaces; if we skip + // this step, embedded `\n` from the source comes through into each + // sentence and breaks textwrap's notion of where lines start. Safe to + // run after placeholder substitution because placeholders + // (`\x00PH\x00`) contain no whitespace. + let normalized = collapse_whitespace(&substituted); + + let raw_segments: Vec<&str> = normalized.unicode_sentences().collect(); + if raw_segments.is_empty() { + return vec![trimmed.to_owned()]; + } + + let merged = merge_abbreviation_splits(&raw_segments); + let merged = merge_quoted_punct_splits(merged); + + merged + .into_iter() + .map(|s| restore_placeholders(s.trim(), &placeholders)) + .filter(|s| !s.is_empty()) + .collect() +} + +/// Replace any newline (CR or LF) followed by horizontal whitespace with a +/// single space. +/// Multi-space runs that don't include a newline are left alone (matching +/// CommonMark's preservation of literal spaces in inline code, and avoiding +/// surprising changes elsewhere). +/// +/// Used to fold the contents of atomic spans (emphasis, inline code, links) +/// that happen to cross a source-line boundary before they're stored as +/// placeholders; without this, textwrap would later treat the embedded ` \n ` +/// as a forced break and the container-prefix step would double up the +/// continuation indent. 
+fn fold_line_breaks(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + let mut chars = s.chars().peekable(); + while let Some(c) = chars.next() { + if c == '\n' || c == '\r' { + out.push(' '); + while chars.peek().is_some_and(|next| matches!(*next, ' ' | '\t')) { + chars.next(); + } + } else { + out.push(c); + } + } + out +} + +/// Collapse every run of Unicode whitespace into a single ASCII space. +/// Used to normalise markdown paragraph content (soft line breaks, indent on +/// continuation lines, accidental double spaces) before sentence segmentation. +fn collapse_whitespace(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + let mut in_ws = false; + for c in s.chars() { + if c.is_whitespace() { + if !in_ws { + out.push(' '); + in_ws = true; + } + } else { + out.push(c); + in_ws = false; + } + } + out.trim().to_owned() +} + +fn restore_placeholders(s: &str, placeholders: &[String]) -> String { + let mut restored = s.to_owned(); + for (i, original) in placeholders.iter().enumerate() { + let ph = format!("\x00PH{i}\x00"); + restored = restored.replace(&ph, original); + } + restored +} + +/// Re-join consecutive segments when the earlier one ends in a known +/// abbreviation; UAX \#29 doesn't know about these and false-splits. +fn merge_abbreviation_splits(segments: &[&str]) -> Vec { + let mut result: Vec = Vec::with_capacity(segments.len()); + for &segment in segments { + let merge = result + .last() + .is_some_and(|prev| is_abbreviation_ending(prev)); + if merge { + result.last_mut().unwrap().push_str(segment); + } else { + result.push(segment.to_owned()); + } + } + result +} + +/// Re-join when a segment ends with sentence punctuation inside closing +/// quotes/parens AND the next segment starts with a lowercase letter, meaning +/// the apparent break is actually mid-sentence. +/// E.g. +/// `He said "wow!" and left.` is one sentence, not two. 
+fn merge_quoted_punct_splits(segments: Vec) -> Vec { + let mut result: Vec = Vec::with_capacity(segments.len()); + for segment in segments { + let merge = result.last().is_some_and(|prev| { + QUOTED_PUNCT_END_RE.is_match(prev.trim_end()) + && segment + .trim_start() + .chars() + .next() + .is_some_and(char::is_lowercase) + }); + if merge { + result.last_mut().unwrap().push_str(&segment); + } else { + result.push(segment); + } + } + result +} + +fn is_abbreviation_ending(s: &str) -> bool { + let trimmed = s.trim_end(); + if !trimmed.ends_with('.') { + return false; + } + let before_dot = &trimmed[..trimmed.len() - 1]; + ABBREV_RE.is_match(before_dot) || MULTI_ABBREV_RE.is_match(before_dot) +} + +#[cfg(test)] +#[path = "sentence_tests.rs"] +mod tests; diff --git a/crates/contrib/comfort/src/sentence_tests.rs b/crates/contrib/comfort/src/sentence_tests.rs new file mode 100644 index 00000000..73cc4bf6 --- /dev/null +++ b/crates/contrib/comfort/src/sentence_tests.rs @@ -0,0 +1,267 @@ +//! Test suite ported from snapper-fmt's `sentence/unicode.rs`, MIT-licensed, +//! Copyright (c) 2026 Rohit Goswami. +//! Verifies that comfort's inlined English splitter behaves identically to the +//! upstream English configuration. + +use pretty_assertions::assert_eq; + +use super::split_sentences; + +fn split(text: &str) -> Vec { + split_sentences(text, &[]) +} + +fn split_with_atomic(text: &str, atomic_ranges: &[std::ops::Range]) -> Vec { + split_sentences(text, atomic_ranges) +} + +#[test] +fn simple_sentences() { + assert_eq!( + split("Hello world. This is a test. Another sentence here."), + vec!["Hello world.", "This is a test.", "Another sentence here."] + ); +} + +#[test] +fn abbreviation_dr() { + assert_eq!(split("Dr. Smith went home. He was tired."), vec![ + "Dr. Smith went home.", + "He was tired." + ]); +} + +#[test] +fn abbreviation_eg() { + assert_eq!( + split("Use a formatter, e.g. snapper. It works well."), + vec!["Use a formatter, e.g. 
snapper.", "It works well."] + ); +} + +#[test] +fn abbreviation_fig() { + assert_eq!( + split("See Fig. 3 for details. The results are clear."), + vec!["See Fig. 3 for details.", "The results are clear."] + ); +} + +#[test] +fn empty_input() { + assert_eq!(split(""), Vec::::new()); +} + +#[test] +fn single_sentence() { + assert_eq!(split("Just one sentence."), vec!["Just one sentence."]); +} + +#[test] +fn question_and_exclamation() { + assert_eq!(split("Is this working? Yes! It is."), vec![ + "Is this working?", + "Yes!", + "It is." + ]); +} + +#[test] +fn no_trailing_period() { + assert_eq!(split("First sentence. Second without period"), vec![ + "First sentence.", + "Second without period" + ]); +} + +#[test] +fn inline_org_link_preserved() { + assert_eq!( + split("See [[https://example.com][Ex. Site]] for details. Then continue."), + vec![ + "See [[https://example.com][Ex. Site]] for details.", + "Then continue." + ] + ); +} + +#[test] +fn inline_math_preserved() { + assert_eq!(split("The value $x = 3.14$ matters. Next sentence."), vec![ + "The value $x = 3.14$ matters.", + "Next sentence." + ]); +} + +#[test] +fn inline_markdown_link_preserved() { + assert_eq!( + split("Visit [Example Inc.](https://example.com) now. Then read more."), + vec![ + "Visit [Example Inc.](https://example.com) now.", + "Then read more." + ] + ); +} + +#[test] +fn bold_span_with_period_does_not_split_mid_span() { + // Regression: `**Heading.** Body.` used to split at the period inside + // the bold span, stranding `**` on the next line. + assert_eq!(split("**Heading.** Body sentence here."), vec![ + "**Heading.** Body sentence here." + ]); +} + +#[test] +fn bold_span_with_internal_period_then_real_sentence_break() { + // The period inside the bold span doesn't break, but the period + // outside it still does. + assert_eq!(split("**Title.** First sentence. 
Second sentence."), vec![ + "**Title.** First sentence.", + "Second sentence.", + ]); +} + +#[test] +fn atomic_range_protects_explicit_span() { + // The caller (format.rs) marks the bold span as atomic via byte range. + // The splitter must not break inside it, even though it contains a + // sentence-terminator period. + let text = "**Heading.** Body sentence here."; + let bold = 0..text.find("** B").unwrap() + 2; // covers `**Heading.**` + assert_eq!(split_with_atomic(text, &[bold]), vec![ + "**Heading.** Body sentence here." + ]); +} + +#[test] +fn atomic_range_does_not_swallow_following_sentence_break() { + let text = "**Title.** First. Second."; + let bold = 0..text.find("** F").unwrap() + 2; + assert_eq!(split_with_atomic(text, &[bold]), vec![ + "**Title.** First.", + "Second.", + ]); +} + +#[test] +fn atomic_range_overlapping_a_regex_match_dedupes_to_first() { + // `**Heading.**` is matched by both the caller's AST atomic range AND + // the bold-regex fallback. The caller's range wins; the regex match + // gets dropped as overlapping. + let text = "**Heading.** Body."; + let bold = 0..12; // `**Heading.**` + let out = split_with_atomic(text, &[bold]); + assert_eq!(out, vec!["**Heading.** Body."]); +} + +#[test] +#[allow( + clippy::reversed_empty_ranges, + reason = "testing malformed input on purpose" +)] +fn atomic_range_out_of_bounds_is_ignored() { + // Defensive: malformed ranges shouldn't panic. + let text = "Hello world."; + let bogus = vec![100..200, 5..3]; + let out = split_with_atomic(text, &bogus); + assert_eq!(out, vec!["Hello world."]); +} + +#[test] +fn strikethrough_with_period_is_preserved() { + assert_eq!(split("~~obsolete.~~ Still here."), vec![ + "~~obsolete.~~ Still here." + ]); +} + +#[test] +fn inline_code_preserved() { + assert_eq!(split("Use `std.io.Read` for input. Then process."), vec![ + "Use `std.io.Read` for input.", + "Then process." + ]); +} + +#[test] +fn quoted_exclamation_no_false_split() { + assert_eq!(split(r#"He said "wow!" 
and left. She agreed."#), vec![ + r#"He said "wow!" and left."#, + "She agreed." + ]); +} + +#[test] +fn paren_exclamation_no_false_split() { + assert_eq!( + split("He replied (with emphasis!) loudly. She agreed."), + vec!["He replied (with emphasis!) loudly.", "She agreed."] + ); +} + +#[test] +fn paren_question_no_false_split() { + assert_eq!( + split("The answer (really?) surprised them. Next sentence."), + vec!["The answer (really?) surprised them.", "Next sentence."] + ); +} + +#[test] +fn url_trailing_period_not_swallowed() { + assert_eq!( + split("Visit https://example.com/path. Then read more."), + vec!["Visit https://example.com/path.", "Then read more."] + ); +} + +#[test] +fn url_with_query_trailing_period() { + assert_eq!( + split("See https://example.com/path?q=1&r=2. Next sentence."), + vec!["See https://example.com/path?q=1&r=2.", "Next sentence."] + ); +} + +#[test] +fn ellipsis_splits() { + assert_eq!(split("Sentence one... Sentence two."), vec![ + "Sentence one...", + "Sentence two." + ]); +} + +#[test] +fn soft_line_breaks_are_collapsed_to_spaces() { + // The text comes in with embedded newlines (markdown soft breaks). + // Each output sentence must be one logical line — no `\n` leakage. + let out = split("If foo, that\ntool is included. This\nprevents a problem."); + assert_eq!(out, vec![ + "If foo, that tool is included.", + "This prevents a problem.", + ]); +} + +#[test] +fn runs_of_whitespace_collapse_to_one_space() { + let out = split("First sentence. Second\n\nsentence."); + assert_eq!(out, vec!["First sentence.", "Second sentence."]); +} + +#[test] +fn inline_code_internal_whitespace_is_preserved_through_normalisation() { + // The two spaces inside the backticks survive the whitespace collapse + // because the inline code span is placeholdered first. + let out = split("Use `foo bar` for this."); + assert_eq!(out, vec!["Use `foo bar` for this."]); +} + +#[test] +fn quoted_period_end_of_sentence() { + // "done." 
followed by uppercase Start is a real sentence boundary. + assert_eq!(split(r#"End of quote: "done." Start again."#), vec![ + r#"End of quote: "done.""#, + "Start again." + ]); +} diff --git a/crates/contrib/comfort/src/walk.rs b/crates/contrib/comfort/src/walk.rs new file mode 100644 index 00000000..f7676ba2 --- /dev/null +++ b/crates/contrib/comfort/src/walk.rs @@ -0,0 +1,129 @@ +//! File discovery for workspace and path-based invocations. + +use std::path::{Path, PathBuf}; + +use cargo_metadata::{MetadataCommand, Package}; +use ignore::WalkBuilder; + +use crate::{Error, cli::Language}; + +/// Discover files inside the current cargo workspace, honoring `.gitignore` and +/// friends, filtering by `language`. +/// Returns paths in walker order. +/// +/// `include` and `exclude` further filter the workspace by package name. +/// When both are empty, every workspace package is walked. +/// When `include` is non-empty, only those packages are walked. +/// `exclude` always removes packages from the resulting set. +/// Either list having an unknown name produces [`Error::UnknownPackage`]. +pub fn workspace_files( + include: &[String], + exclude: &[String], + language: Language, +) -> Result, Error> { + let metadata = MetadataCommand::new().no_deps().exec()?; + + if include.is_empty() && exclude.is_empty() { + return walk_files(metadata.workspace_root.as_std_path(), language); + } + + let workspace_packages = metadata.workspace_packages(); + let available: Vec<&str> = workspace_packages.iter().map(|p| p.name.as_str()).collect(); + + validate_package_names(&available, include)?; + validate_package_names(&available, exclude)?; + + let selected = select_packages(&workspace_packages, include, exclude); + + let mut files = Vec::new(); + for pkg in selected { + let Some(dir) = pkg.manifest_path.parent() else { + continue; + }; + files.extend(walk_files(dir.as_std_path(), language)?); + } + Ok(files) +} + +/// Walk a single directory or accept a single file path. 
+/// Files are returned as-is (even if their extension doesn't match `language`) +/// — the caller asked for them by name. +/// Directories are walked, respecting `.gitignore`, and filtered by `language`. +/// Returns [`Error::Walk`] for walker errors (unreadable directory, symlink +/// loop, etc.) so a `--check --workspace` run can't silently exit 0 without +/// having inspected every file it was supposed to cover. +pub fn expand_path(input: &Path, language: Language) -> Result, Error> { + if input.is_dir() { + walk_files(input, language) + } else { + Ok(vec![input.to_path_buf()]) + } +} + +fn walk_files(root: &Path, language: Language) -> Result, Error> { + let mut out = Vec::new(); + for entry in WalkBuilder::new(root).standard_filters(true).build() { + let entry = entry?; + if !entry.file_type().is_some_and(|t| t.is_file()) { + continue; + } + let path = entry.into_path(); + if matches_language(&path, language) { + out.push(path); + } + } + Ok(out) +} + +/// True if a discovered file's extension falls inside the set selected by the +/// given language. +/// With [`Language::Auto`], both Rust and Markdown extensions are included; +/// with an explicit language, only that one's. +fn matches_language(path: &Path, language: Language) -> bool { + let ext = path.extension().and_then(|e| e.to_str()); + matches!( + (language, ext), + (Language::Auto, Some("rs" | "md" | "markdown")) + | (Language::Rust, Some("rs")) + | (Language::Markdown, Some("md" | "markdown")) + ) +} + +/// Apply include/exclude filters to a list of workspace packages. +fn select_packages<'a>( + packages: &'a [&'a Package], + include: &[String], + exclude: &[String], +) -> Vec<&'a Package> { + packages + .iter() + .filter(|p| should_include(p.name.as_str(), include, exclude)) + .copied() + .collect() +} + +/// Returns true if a package with the given name should be included given the +/// user's `-p`/`--exclude` selection. 
+/// Pure; extracted so the resolution logic can be tested without constructing +/// `cargo_metadata` types. +fn should_include(name: &str, include: &[String], exclude: &[String]) -> bool { + let included = include.is_empty() || include.iter().any(|n| n == name); + let excluded = exclude.iter().any(|n| n == name); + included && !excluded +} + +/// Confirm every name in `names` matches some entry in `available`. +/// Returns [`Error::UnknownPackage`] for the first mismatch — fail-fast on +/// typos. +fn validate_package_names(available: &[&str], names: &[String]) -> Result<(), Error> { + for name in names { + if !available.iter().any(|a| a == name) { + return Err(Error::UnknownPackage(name.clone())); + } + } + Ok(()) +} + +#[cfg(test)] +#[path = "walk_tests.rs"] +mod tests; diff --git a/crates/contrib/comfort/src/walk_tests.rs b/crates/contrib/comfort/src/walk_tests.rs new file mode 100644 index 00000000..dcb6d64f --- /dev/null +++ b/crates/contrib/comfort/src/walk_tests.rs @@ -0,0 +1,102 @@ +//! Tests for the pure parts of workspace walking: the include/exclude predicate +//! and the unknown-package validation. +//! The actual file walking and `cargo_metadata` invocation are covered by +//! end-to-end integration tests via the binary, not unit-tested here. 
+ +use std::path::Path; + +use pretty_assertions::assert_eq; + +use super::{matches_language, should_include, validate_package_names}; +use crate::{Error, cli::Language}; + +fn names(vs: &[&str]) -> Vec { + vs.iter().map(|s| (*s).to_owned()).collect() +} + +#[test] +fn empty_include_empty_exclude_keeps_every_package() { + assert!(should_include("foo", &[], &[])); + assert!(should_include("bar", &[], &[])); +} + +#[test] +fn explicit_include_restricts_to_listed_packages() { + let include = names(&["foo", "bar"]); + assert!(should_include("foo", &include, &[])); + assert!(should_include("bar", &include, &[])); + assert!(!should_include("baz", &include, &[])); +} + +#[test] +fn exclude_alone_keeps_unlisted_packages() { + let exclude = names(&["baz"]); + assert!(should_include("foo", &[], &exclude)); + assert!(!should_include("baz", &[], &exclude)); +} + +#[test] +fn exclude_takes_precedence_over_include() { + // A package both included AND excluded is excluded. This matches + // `cargo check --workspace --exclude foo`-style semantics. 
+ let include = names(&["foo", "bar"]); + let exclude = names(&["foo"]); + assert!(!should_include("foo", &include, &exclude)); + assert!(should_include("bar", &include, &exclude)); +} + +#[test] +fn validate_succeeds_when_every_name_matches() { + let available = ["foo", "bar", "baz"]; + let names = names(&["bar", "baz"]); + assert!(validate_package_names(&available, &names).is_ok()); +} + +#[test] +fn validate_succeeds_on_empty_input() { + let available = ["foo"]; + assert!(validate_package_names(&available, &[]).is_ok()); +} + +#[test] +fn language_auto_accepts_both_rust_and_markdown() { + assert!(matches_language(Path::new("foo.rs"), Language::Auto)); + assert!(matches_language(Path::new("foo.md"), Language::Auto)); + assert!(matches_language(Path::new("foo.markdown"), Language::Auto)); + assert!(!matches_language(Path::new("foo.txt"), Language::Auto)); + assert!(!matches_language(Path::new("Cargo.toml"), Language::Auto)); +} + +#[test] +fn language_rust_filters_to_rs_only() { + assert!(matches_language(Path::new("foo.rs"), Language::Rust)); + assert!(!matches_language(Path::new("foo.md"), Language::Rust)); + assert!(!matches_language(Path::new("foo.markdown"), Language::Rust)); +} + +#[test] +fn language_markdown_filters_to_md_and_markdown() { + assert!(matches_language(Path::new("foo.md"), Language::Markdown)); + assert!(matches_language( + Path::new("foo.markdown"), + Language::Markdown + )); + assert!(!matches_language(Path::new("foo.rs"), Language::Markdown)); +} + +#[test] +fn language_filter_skips_files_without_extension() { + assert!(!matches_language(Path::new("README"), Language::Auto)); + assert!(!matches_language(Path::new("Makefile"), Language::Rust)); + assert!(!matches_language(Path::new("LICENSE"), Language::Markdown)); +} + +#[test] +fn validate_returns_first_unknown_name() { + let available = ["foo", "bar"]; + let lookup = names(&["bar", "ghost"]); + match validate_package_names(&available, &lookup) { + Err(Error::UnknownPackage(name)) => 
assert_eq!(name, "ghost"), + other => panic!("expected UnknownPackage, got {other:?}"), + } +} diff --git a/deny.toml b/deny.toml index 23ae1ce9..16d24209 100644 --- a/deny.toml +++ b/deny.toml @@ -15,6 +15,7 @@ allow = [ "MIT", "MPL-2.0", "Unicode-3.0", + "Unicode-DFS-2016", "Zlib", ] @@ -33,6 +34,4 @@ allow-git = [ "https://github.com/JeanMertz/inquire?branch=submit-on-valid-parse", "https://github.com/JeanMertz/openai-responses-rs", "https://github.com/JeanMertz/saphyr?branch=jean/fix-valid-literal-block-scalar-check", - "https://github.com/JeanMertz/schematic?branch=merged", - "https://github.com/zkat/miette", ] diff --git a/justfile b/justfile index 32e19204..79a9c02e 100644 --- a/justfile +++ b/justfile @@ -1631,7 +1631,7 @@ plugin-build-local: _install-jp (plugin-build "") # Run all ci tasks. [group('ci')] -ci: lint-ci fmt-ci test-ci docs-ci coverage-ci deny-ci insta-ci shear-ci vet-ci +ci: lint-ci fmt-ci fmt-comments-ci test-ci docs-ci coverage-ci deny-ci insta-ci shear-ci vet-ci # Lint the code on CI. [group('ci')] @@ -1643,6 +1643,11 @@ lint-ci: (_rustup_component "clippy") _install_ci_matchers fmt-ci: (_rustup_component "rustfmt") _install_ci_matchers cargo fmt --all --check +# Check Rust doc-comment formatting on CI. +[group('ci')] +fmt-comments-ci: _install-comfort _install_ci_matchers + comfort --check --workspace + # Test the code on CI. [group('ci')] test-ci: (_install "cargo-nextest@" + nextest_version) _install_ci_matchers @@ -1706,6 +1711,9 @@ vet-ci: (_install "cargo-vet@" + vet_version) @_install-jp *args: cargo install {{quiet_flag}} --locked --path crates/jp_cli {{args}} +@_install-comfort *args: + cargo install {{quiet_flag}} --locked --path crates/contrib/comfort {{args}} + @_install-binstall: command -v cargo-binstall >/dev/null 2>&1 || { \ curl -L --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/cargo-bins/cargo-binstall/main/install-from-binstall-release.sh | BINSTALL_VERSION={{binstall_version}} sh; \