From 38fa437ac958de3097dd90ed213eb5b83ea546cc Mon Sep 17 00:00:00 2001 From: zackees Date: Sun, 21 Jun 2026 17:22:29 -0700 Subject: [PATCH] perf(build): default to dev profile + rust-lld for ~5x faster rebuild on Rust edits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit setup.py was always invoking `soldr cargo build --release -p fbuild-cli`, so every `pip install`/`uv sync` rebuild went through the full release codegen pipeline (opt-level=3 + LLVM passes + link.exe) — about 100s per real source edit even with a hot cache. The shipped wheel needs that pipeline; the dev iteration loop does not. Three coordinated changes: - `setup.py`: build with the dev profile by default. Pass `--release` only when `FBUILD_BUILD_RELEASE=1` is set, so packaging / perf-test flows can still produce an optimized binary. The PyPI release path bypasses setup.py entirely (it calls cargo zigbuild --release directly in `release-auto.yml`), so this only affects local installs. - `Cargo.toml`: `[profile.dev.package."*"] opt-level = 3` so third-party deps stay optimized even in the dev profile — they compile once on first install and the runtime hot paths (serde, tokio, reqwest) keep their performance. Only fbuild's own ~15 crates compile at opt-level=0, which is exactly where edit-then-rebuild churn happens. - `.cargo/config.toml`: `[target.x86_64-pc-windows-msvc] linker = "rust-lld.exe"`. Ships with the Rust toolchain, ~2-5x faster than link.exe on link-heavy rebuilds. Cross-profile, cross-tool. Measurements (semantic edit to `crates/fbuild-core/src/lib.rs`, then `uv run python --version`): | Profile | Time | |--------------------------|-------:| | Release (was the default)| 100.1s | | Dev (new default) | 18.9s | 5.3x speedup on the path the user actually edits in. 
Touch-only edits remain dominated by soldr+uv overhead (~14s) because zccache hits cover the rustc work and dev-vs-release codegen never runs — those weren't slower to begin with, the cost there is process-spawn overhead. If you ever need the optimized CLI locally (perf testing, debugging a real-world slowdown): FBUILD_BUILD_RELEASE=1 uv sync --reinstall-package fbuild Co-Authored-By: Claude Opus 4.7 (1M context) --- .cargo/config.toml | 8 ++++++++ Cargo.toml | 14 ++++++++++++++ setup.py | 29 ++++++++++++++++++++++++----- 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index 66fee73b..3dff0375 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -4,3 +4,11 @@ protocol = "sparse" [net] retry = 2 git-fetch-with-cli = true + +# Use `rust-lld` instead of MSVC `link.exe` on Windows. rust-lld ships +# with the Rust toolchain (no install), is typically 2-5x faster than +# link.exe, and produces the same executable format. Applied to all +# profiles — release builds also benefit. If something ever fails to +# link, override per-invocation with `RUSTFLAGS="-C linker=link.exe"`. +[target.x86_64-pc-windows-msvc] +linker = "rust-lld.exe" diff --git a/Cargo.toml b/Cargo.toml index cdb0e039..9be360d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -114,3 +114,17 @@ running-process = { version = "4.3.0", default-features = false, features = ["cl [profile.dev] debug = 0 +# Cargo's dev defaults: opt-level = 0 for first-party (fast compile + +# fast link), incremental = true (cargo + linker keep state across +# builds), codegen-units = 256 (parallel compile units). We accept +# those for our own crates so `pip install`/`uv sync` rebuilds finish +# quickly when iterating. + +# Optimize third-party deps even in dev so runtime perf doesn't tank +# just because we want a fast local-rebuild loop. 
Cargo compiles each +# upstream crate exactly once and caches it, so this only costs the +# first build — every subsequent rebuild of fbuild's source is still +# the fast dev-profile path. +[profile.dev.package."*"] +opt-level = 3 +debug = false diff --git a/setup.py b/setup.py index 4a8d44a3..f6ad6168 100644 --- a/setup.py +++ b/setup.py @@ -173,16 +173,34 @@ def _find_fbuild_executable_from_json(stdout: str) -> Optional[Path]: return binary_path +def _use_release_profile() -> bool: + """True when this build should produce a release-optimized binary. + + Default is `False` — pip/uv-driven builds use the dev profile so the + iteration loop is fast (workspace's third-party deps stay at opt-level + 3 via `[profile.dev.package."*"]`, only our own crates compile + unoptimized). Set `FBUILD_BUILD_RELEASE=1` to opt into a release + build when you actually want a fast binary (CI, packaging, perf + tests). + """ + return os.environ.get("FBUILD_BUILD_RELEASE", "").lower() in ("1", "true", "yes") + + +def _profile_subdir() -> str: + return "release" if _use_release_profile() else "debug" + + def _find_fbuild_executable_by_search() -> Optional[Path]: """Fallback when cargo didn't emit a usable artifact line (e.g. a fully cached build that reports `Fresh` and skips compiler-artifact). Probe - the canonical `target/release` path and every per-host-triple subdir. + the canonical `target/<profile>` path and every per-host-triple subdir. 
""" - candidates = [REPO_ROOT / "target" / "release" / TARGET_BINARY_NAME] - target_root = REPO_ROOT / "target" + profile_dir = _profile_subdir() + target_root = Path(os.environ.get("CARGO_TARGET_DIR", REPO_ROOT / "target")) + candidates = [target_root / profile_dir / TARGET_BINARY_NAME] if target_root.is_dir(): for child in target_root.iterdir(): - candidate = child / "release" / TARGET_BINARY_NAME + candidate = child / profile_dir / TARGET_BINARY_NAME if candidate.is_file(): candidates.append(candidate) for candidate in candidates: @@ -197,11 +215,12 @@ def _build_fbuild_cli() -> Path: "soldr", "cargo", "build", - "--release", "-p", "fbuild-cli", "--message-format=json-render-diagnostics", ] + if _use_release_profile(): + cmd.insert(3, "--release") sys.stderr.write(f" $ {' '.join(cmd)}\n") # stderr passes through so soldr's session summary stays visible; stdout # is captured because that's where cargo writes its JSON artifact stream.