CodSpeedHQ · not-matthias · May 1, 2026 · May 1, 2026 · May 1, 2026 · May 1, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -62,6 +62,7 @@ futures = "0.3.31"
 runner-shared = { path = "crates/runner-shared" }
 memtrack = { path = "crates/memtrack", default-features = false }
 exec-harness = { path = "crates/exec-harness" }
+instrument-hooks-bindings = { path = "crates/instrument-hooks-bindings" }
 ipc-channel = "0.18"
 shellexpand = { version = "3.1.1", features = ["tilde"] }
 addr2line = "0.25"
@@ -74,6 +75,7 @@ rmp-serde = "1.3.0"
 uuid = { version = "1.21.0", features = ["v4"] }
 which = "8.0.2"
 crc32fast = "1.5.0"
+samply = { git = "https://github.com/AvalancheHQ/samply", branch = "codspeed" }
 
 [target.'cfg(target_os = "linux")'.dependencies]
 procfs = "0.17.0"

diff --git a/crates/runner-shared/src/fifo.rs b/crates/runner-shared/src/fifo.rs
@@ -18,20 +18,20 @@ const _: () = assert!(
 /// The different markers that can be set in the perf.data.
 ///
 /// `SampleStart/End`: Marks the start and end of a sampling period. This is used to differentiate between benchmarks.
-/// `BenchmarkStart/End`: Marks the start and end of a benchmark. This is used to measure the duration of a benchmark, without the benchmark harness code.
+/// `RoundStart/End`: Marks the start and end of a measured round. This is used to measure the duration of a benchmark, without the benchmark harness code.
 #[derive(
     serde::Serialize, serde::Deserialize, Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Clone,
 )]
 pub enum MarkerType {
     SampleStart(u64),
     SampleEnd(u64),
-    BenchmarkStart(u64),
-    BenchmarkEnd(u64),
+    RoundStart(u64),
+    RoundEnd(u64),
 }
 
 #[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq, Eq)]
 pub enum IntegrationMode {
-    Perf,
+    Walltime,
     Simulation,
     Analysis,
 }
@@ -42,11 +42,11 @@ pub enum Command {
         pid: i32,
         uri: String,
     },
-    StartBenchmark,
-    StopBenchmark,
+    StartProfiler,
+    StopProfiler,
     Ack,
     #[deprecated(note = "Use `GetIntegrationMode` instead")]
-    PingPerf,
+    PingProfiler,
     SetIntegration {
         name: String,
         version: String,

diff --git a/crates/runner-shared/src/metadata.rs b/crates/runner-shared/src/metadata.rs
@@ -12,7 +12,7 @@ use crate::module_symbols::MappedProcessModuleSymbols;
 use crate::unwind_data::MappedProcessUnwindData;
 
 #[derive(Serialize, Deserialize, Default)]
-pub struct PerfMetadata {
+pub struct WalltimeMetadata {
     /// The version of this metadata format.
     pub version: u64,
 
@@ -71,13 +71,13 @@ pub struct PerfMetadata {
     pub debug_info_by_pid: HashMap<pid_t, Vec<ModuleDebugInfo>>,
 }
 
-impl PerfMetadata {
+impl WalltimeMetadata {
     pub fn from_reader<R: std::io::Read>(reader: R) -> anyhow::Result<Self> {
-        serde_json::from_reader(reader).context("Could not parse perf metadata from JSON")
+        serde_json::from_reader(reader).context("Could not parse walltime metadata from JSON")
     }
 
     pub fn save_to<P: AsRef<Path>>(&self, path: P) -> anyhow::Result<()> {
-        let file = std::fs::File::create(path.as_ref().join("perf.metadata"))?;
+        let file = std::fs::File::create(path.as_ref().join("walltime.metadata"))?;
         const BUFFER_SIZE: usize = 256 * 1024 /* 256 KB */;
 
         let writer = BufWriter::with_capacity(BUFFER_SIZE, file);

diff --git a/src/cli/exec/mod.rs b/src/cli/exec/mod.rs
@@ -77,8 +77,8 @@ fn build_orchestrator_config(
         targets: vec![target],
         modes,
         instruments: Instruments { mongodb: None }, // exec doesn't support MongoDB
-        perf_unwinding_mode: args.shared.perf_run_args.perf_unwinding_mode,
-        enable_perf: args.shared.perf_run_args.enable_perf,
+        perf_unwinding_mode: args.shared.profiler_run_args.perf.perf_unwinding_mode,
+        enable_profiler: args.shared.profiler_run_args.resolve_enable_profiler(),
         simulation_tool: args.shared.simulation_tool.unwrap_or_default(),
         profile_folder: args.shared.profile_folder,
         skip_upload: args.shared.skip_upload,

diff --git a/src/cli/mod.rs b/src/cli/mod.rs
@@ -2,6 +2,7 @@ mod auth;
 pub(crate) mod exec;
 pub(crate) mod experimental;
 pub(crate) mod run;
+mod samply;
 mod setup;
 mod shared;
 mod show;
@@ -117,6 +118,17 @@ enum Commands {
     Show,
     /// Update the CodSpeed CLI to the latest version
     Update,
+
+    #[command(flatten)]
+    Internal(InternalCommands),
+}
+
+/// Subcommands the CLI uses to re-invoke itself; not user-facing entry points.
+#[derive(Subcommand, Debug)]
+enum InternalCommands {
+    /// Run the bundled samply profiler. Args are forwarded to samply.
+    #[command(disable_help_flag = true, disable_help_subcommand = true)]
+    Samply(samply::SamplyArgs),
 }
 
 pub async fn run() -> Result<()> {
@@ -141,7 +153,7 @@ pub async fn run() -> Result<()> {
     let setup_cache_dir = setup_cache_dir.as_deref();
 
     match cli.command {
-        Commands::Run(_) | Commands::Exec(_) => {} // Run and Exec are responsible for their own logger initialization
+        Commands::Run(_) | Commands::Exec(_) | Commands::Internal(InternalCommands::Samply(_)) => {} // these are responsible for their own logger initialization
         _ => {
             init_local_logger()?;
         }
@@ -176,6 +188,7 @@ pub async fn run() -> Result<()> {
         Commands::Use(args) => use_mode::run(args)?,
         Commands::Show => show::run()?,
         Commands::Update => update::run().await?,
+        Commands::Internal(InternalCommands::Samply(args)) => samply::run(args)?,
     }
     Ok(())
 }

diff --git a/src/cli/run/mod.rs b/src/cli/run/mod.rs
@@ -48,8 +48,8 @@ pub enum MessageFormat {
 impl RunArgs {
     /// Constructs a new `RunArgs` with default values for testing purposes
     pub fn test() -> Self {
-        use super::PerfRunArgs;
         use super::experimental::ExperimentalArgs;
+        use super::{PerfRunArgs, ProfilerRunArgs};
         use crate::RunnerMode;
 
         Self {
@@ -69,9 +69,12 @@ impl RunArgs {
                 go_runner_version: None,
                 show_full_output: false,
                 base: None,
-                perf_run_args: PerfRunArgs {
-                    enable_perf: false,
-                    perf_unwinding_mode: None,
+                profiler_run_args: ProfilerRunArgs {
+                    enable_profiler: false,
+                    enable_perf: None,
+                    perf: PerfRunArgs {
+                        perf_unwinding_mode: None,
+                    },
                 },
                 experimental: ExperimentalArgs {
                     experimental_fair_sched: false,
@@ -111,8 +114,8 @@ fn build_orchestrator_config(
         targets,
         modes,
         instruments,
-        perf_unwinding_mode: args.shared.perf_run_args.perf_unwinding_mode,
-        enable_perf: args.shared.perf_run_args.enable_perf,
+        perf_unwinding_mode: args.shared.profiler_run_args.perf.perf_unwinding_mode,
+        enable_profiler: args.shared.profiler_run_args.resolve_enable_profiler(),
         simulation_tool: args.shared.simulation_tool.unwrap_or_default(),
         profile_folder: args.shared.profile_folder,
         skip_upload: args.shared.skip_upload,

diff --git a/src/cli/samply.rs b/src/cli/samply.rs
@@ -0,0 +1,37 @@
+use clap::Parser;
+
+use crate::prelude::*;
+
+/// Run the bundled samply profiler. Arguments after `samply` are forwarded
+/// verbatim to samply's own CLI parser.
+#[derive(Debug, clap::Args)]
+pub struct SamplyArgs {
+    #[arg(trailing_var_arg = true, allow_hyphen_values = true)]
+    args: Vec<std::ffi::OsString>,
+}
+
+pub fn run(args: SamplyArgs) -> Result<()> {
+    use ::samply::cli;
+
+    let argv = std::iter::once(std::ffi::OsString::from("samply")).chain(args.args);
+    let opt = cli::Opt::parse_from(argv);
+
+    // samply spins up its own tokio runtime internally, so it runs on a scoped thread outside our
+    // `#[tokio::main]` runtime. A non-`record` action panics there and is reported by `join()` below.
+    std::thread::scope(|s| {
+        s.spawn(|| match opt.action {
+            #[cfg(any(
+                target_os = "android",
+                target_os = "macos",
+                target_os = "linux",
+                target_os = "windows"
+            ))]
+            cli::Action::Record(a) => ::samply::do_record_action(a),
+            _ => unimplemented!("Only `samply record` is supported"),
+        })
+        .join()
+        .map_err(|_| anyhow::anyhow!("samply thread panicked"))
+    })?;
+
+    Ok(())
+}
diff --git a/src/cli/shared.rs b/src/cli/shared.rs
@@ -109,7 +109,7 @@ pub struct ExecAndRunSharedArgs {
     pub base: Option<String>,
 
     #[command(flatten)]
-    pub perf_run_args: PerfRunArgs,
+    pub profiler_run_args: ProfilerRunArgs,
 
     #[command(flatten)]
     pub experimental: ExperimentalArgs,
@@ -151,17 +151,41 @@ pub enum UnwindingMode {
 }
 
 #[derive(Args, Debug, Clone)]
-pub struct PerfRunArgs {
-    /// Enable the linux perf profiler to collect granular performance data.
+pub struct ProfilerRunArgs {
+    /// Enable a profiler to collect granular performance data.
     /// This is only supported on Linux.
-    #[arg(long, env = "CODSPEED_PERF_ENABLED", default_value_t = true)]
-    pub enable_perf: bool,
+    #[arg(long, env = "CODSPEED_PROFILER_ENABLED", default_value_t = true)]
+    pub enable_profiler: bool,
+
+    /// Deprecated alias for --enable-profiler / CODSPEED_PROFILER_ENABLED (bare flag or `=<bool>`).
+    #[arg(long, env = "CODSPEED_PERF_ENABLED", hide = true, num_args = 0..=1, require_equals = true, default_missing_value = "true")]
+    pub enable_perf: Option<bool>,
+
+    #[command(flatten)]
+    pub perf: PerfRunArgs,
+}
 
+#[derive(Args, Debug, Clone)]
+pub struct PerfRunArgs {
     /// The unwinding mode that should be used with perf to collect the call stack.
     #[arg(long, env = "CODSPEED_PERF_UNWINDING_MODE")]
     pub perf_unwinding_mode: Option<UnwindingMode>,
 }
 
+impl ProfilerRunArgs {
+    /// Returns the effective profiler toggle. A value supplied through the deprecated
+    /// `--enable-perf` / `CODSPEED_PERF_ENABLED` takes precedence and logs a warning.
+    pub fn resolve_enable_profiler(&self) -> bool {
+        if let Some(legacy) = self.enable_perf {
+            log::warn!(
+                "CODSPEED_PERF_ENABLED / --enable-perf is deprecated; use CODSPEED_PROFILER_ENABLED / --enable-profiler instead."
+            );
+            return legacy;
+        }
+        self.enable_profiler
+    }
+}
+
 /// Parser for go-runner version that validates semver format
 fn parse_version(s: &str) -> Result<semver::Version, String> {
     semver::Version::parse(s).map_err(|e| format!("Invalid semantic version: {e}"))

diff --git a/src/executor/config.rs b/src/executor/config.rs
@@ -59,7 +59,7 @@ pub struct OrchestratorConfig {
 
     pub modes: Vec<RunnerMode>,
     pub instruments: Instruments,
-    pub enable_perf: bool,
+    pub enable_profiler: bool,
     /// Stack unwinding mode for perf (if enabled)
     pub perf_unwinding_mode: Option<UnwindingMode>,
 
@@ -96,7 +96,7 @@ pub struct ExecutorConfig {
     pub command: String,
 
     pub instruments: Instruments,
-    pub enable_perf: bool,
+    pub enable_profiler: bool,
     /// Stack unwinding mode for perf (if enabled)
     pub perf_unwinding_mode: Option<UnwindingMode>,
 
@@ -181,7 +181,7 @@ impl OrchestratorConfig {
             working_directory: self.working_directory.clone(),
             command,
             instruments: self.instruments.clone(),
-            enable_perf: self.enable_perf,
+            enable_profiler: self.enable_profiler,
             perf_unwinding_mode: self.perf_unwinding_mode,
             simulation_tool: self.simulation_tool,
             skip_run: self.skip_run,
@@ -217,7 +217,7 @@ impl OrchestratorConfig {
             modes: vec![RunnerMode::Simulation],
             instruments: Instruments::test(),
             perf_unwinding_mode: None,
-            enable_perf: false,
+            enable_profiler: false,
             simulation_tool: SimulationTool::default(),
             profile_folder: None,
             skip_upload: false,

diff --git a/src/executor/helpers/env.rs b/src/executor/helpers/env.rs
@@ -24,7 +24,8 @@ pub fn get_base_injected_env(
         ("PYTHONHASHSEED".into(), "0".into()),
         (
             "PYTHON_PERF_JIT_SUPPORT".into(),
-            if mode == RunnerMode::Walltime {
+            // IMPORTANT: Keep this disabled on macOS, otherwise we'll have many unresolved addresses on the stack.
+            if mode == RunnerMode::Walltime && !cfg!(target_os = "macos") {
                 "1".into()
             } else {
                 "0".into()

diff --git a/src/executor/memory/executor.rs b/src/executor/memory/executor.rs
@@ -93,7 +93,7 @@ impl Executor for MemoryExecutor {
     }
 
     async fn run(
-        &self,
+        &mut self,
         execution_context: &ExecutionContext,
         _mongo_tracer: &Option<MongoTracer>,
     ) -> Result<()> {
@@ -207,14 +207,14 @@ impl MemoryExecutor {
                         );
                     }
                 }
-                FifoCommand::StartBenchmark => {
+                FifoCommand::StartProfiler => {
                     debug!("Enabling memtrack via IPC");
                     if let Err(e) = ipc_client.enable() {
                         error!("Failed to enable memtrack: {e}");
                         return Ok(Some(FifoCommand::Err));
                     }
                 }
-                FifoCommand::StopBenchmark => {
+                FifoCommand::StopProfiler => {
                     debug!("Disabling memtrack via IPC");
                     if let Err(e) = ipc_client.disable() {
                         // There's a chance that memtrack has already exited here, so just log as debug

diff --git a/src/executor/mod.rs b/src/executor/mod.rs
@@ -106,7 +106,7 @@ pub trait Executor {
 
     /// Runs the executor
     async fn run(
-        &self,
+        &mut self,
         execution_context: &ExecutionContext,
         // TODO: use Instruments instead of directly passing the mongodb tracer
         mongo_tracer: &Option<MongoTracer>,
@@ -118,7 +118,7 @@ pub trait Executor {
 /// Run a single executor: setup → run → teardown → persist logs.
 /// Does NOT upload.
 pub async fn run_executor(
-    executor: &dyn Executor,
+    executor: &mut dyn Executor,
     orchestrator: &Orchestrator,
     execution_context: &ExecutionContext,
     setup_cache_dir: Option<&Path>,

diff --git a/src/executor/orchestrator.rs b/src/executor/orchestrator.rs
@@ -157,7 +157,7 @@ impl Orchestrator {
             let config = self
                 .config
                 .executor_config_for_command(part.command, !part.uses_exec_harness);
-            let executor = get_executor_from_mode(part.mode);
+            let mut executor = get_executor_from_mode(part.mode);
             let profile_folder =
                 self.resolve_profile_folder(&executor.name(), run_part_index, total_parts)?;
 
@@ -167,7 +167,7 @@ impl Orchestrator {
                 activate_rolling_buffer(&part.label);
             }
 
-            run_executor(executor.as_ref(), self, &ctx, setup_cache_dir).await?;
+            run_executor(executor.as_mut(), self, &ctx, setup_cache_dir).await?;
 
             if !self.config.show_full_output {
                 deactivate_rolling_buffer();