From ff9f5e44f0735eb69f95fa21d53c909040843987 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 10 Jun 2026 15:26:41 -0700
Subject: [PATCH 01/53] Add C++ API for object out-of-scope/freed callbacks

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 src/ray/core_worker/core_worker.cc            |  24 +++
 src/ray/core_worker/core_worker.h             |  23 +++
 src/ray/core_worker/core_worker_process.cc    |  23 +++
 src/ray/core_worker/core_worker_process.h     |  12 ++
 .../core_worker_shutdown_executor.cc          |   6 +
 src/ray/core_worker/tests/core_worker_test.cc | 145 ++++++++++++++++++
 6 files changed, 233 insertions(+)
diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc
index 7257247b5344..a3a30ba6e788 100644
--- a/src/ray/core_worker/core_worker.cc
+++ b/src/ray/core_worker/core_worker.cc
@@ -291,6 +291,7 @@ CoreWorker::CoreWorker(
     CoreWorkerOptions options,
     std::unique_ptr<WorkerContext> worker_context,
     instrumented_io_context &io_service,
+    instrumented_io_context &object_freed_callback_service,
     std::shared_ptr<rpc::CoreWorkerClientPool> core_worker_client_pool,
     std::shared_ptr<rpc::RayletClientPool> raylet_client_pool,
     std::shared_ptr<PeriodicalRunnerInterface> periodical_runner,
@@ -300,6 +301,7 @@ CoreWorker::CoreWorker(
     std::shared_ptr<ipc::RayletIpcClientInterface> raylet_ipc_client,
     std::shared_ptr<RayletClientInterface> local_raylet_rpc_client,
     boost::thread &io_thread,
+    boost::thread &object_freed_callback_thread,
     std::shared_ptr<ReferenceCounterInterface> reference_counter,
     std::shared_ptr<CoreWorkerMemoryStore> memory_store,
     std::shared_ptr<CoreWorkerPlasmaStoreProvider> plasma_store_provider,
@@ -326,6 +328,7 @@ CoreWorker::CoreWorker(
                          : nullptr),
       worker_context_(std::move(worker_context)),
       io_service_(io_service),
+      object_freed_callback_service_(object_freed_callback_service),
       core_worker_client_pool_(std::move(core_worker_client_pool)),
       raylet_client_pool_(std::move(raylet_client_pool)),
       periodical_runner_(std::move(periodical_runner)),
@@ -335,6 +338,7 @@ CoreWorker::CoreWorker(
       raylet_ipc_client_(std::move(raylet_ipc_client)),
       local_raylet_rpc_client_(std::move(local_raylet_rpc_client)),
       io_thread_(io_thread),
+      object_freed_callback_thread_(object_freed_callback_thread),
       reference_counter_(std::move(reference_counter)),
       memory_store_(std::move(memory_store)),
       plasma_store_provider_(std::move(plasma_store_provider)),
@@ -2475,6 +2479,26 @@ bool CoreWorker::IsTaskCanceled(const TaskID &task_id) const {
   return canceled_tasks_.find(task_id) != canceled_tasks_.end();
 }
 
+bool CoreWorker::AddObjectOutOfScopeOrFreedCallback(
+    const ObjectID &object_id, const std::function<void(const ObjectID &)> &callback) {
+  // Wrap so the actual callback runs on the dedicated thread.
+  // The wrapper itself is quick (just a post) and safe to call under the
+  // ReferenceCounter mutex.
+  auto wrapped = [this, callback](const ObjectID &id) {
+    object_freed_callback_service_.post([callback, id]() { callback(id); },
+                                        "CoreWorker.ObjFreedCb");
+  };
+  return reference_counter_->AddObjectOutOfScopeOrFreedCallback(object_id, wrapped);
+}
+
+bool CoreWorker::AddObjectOutOfScopeOrFreedCallback(const ObjectID &object_id,
+                                                    void (*callback)(const ObjectID &,
+                                                                     void *),
+                                                    void *user_data) {
+  return AddObjectOutOfScopeOrFreedCallback(
+      object_id, [callback, user_data](const ObjectID &id) { callback(id, user_data); });
+}
+
 Status CoreWorker::CancelChildren(const TaskID &task_id, bool force_kill) {
   absl::flat_hash_set<TaskID> unknown_child_task_ids;
   auto child_task_ids = task_manager_->GetPendingChildrenTasks(task_id);
diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h
index 83f25904ad08..de924e3a0f0f 100644
--- a/src/ray/core_worker/core_worker.h
+++ b/src/ray/core_worker/core_worker.h
@@ -173,6 +173,7 @@ class CoreWorker : public std::enable_shared_from_this<CoreWorker> {
   CoreWorker(CoreWorkerOptions options,
              std::unique_ptr<WorkerContext> worker_context,
              instrumented_io_context &io_service,
+             instrumented_io_context &object_freed_callback_service,
              std::shared_ptr<rpc::CoreWorkerClientPool> core_worker_client_pool,
              std::shared_ptr<rpc::RayletClientPool> raylet_client_pool,
              std::shared_ptr<PeriodicalRunnerInterface> periodical_runner,
@@ -182,6 +183,7 @@ class CoreWorker : public std::enable_shared_from_this<CoreWorker> {
              std::shared_ptr<ipc::RayletIpcClientInterface> raylet_ipc_client,
              std::shared_ptr<ray::RayletClientInterface> local_raylet_rpc_client,
              boost::thread &io_thread,
+             boost::thread &object_freed_callback_thread,
              std::shared_ptr<ReferenceCounterInterface> reference_counter,
              std::shared_ptr<CoreWorkerMemoryStore> memory_store,
              std::shared_ptr<CoreWorkerPlasmaStoreProvider> plasma_store_provider,
@@ -1482,6 +1484,21 @@ class CoreWorker : public std::enable_shared_from_this<CoreWorker> {
     reference_counter_->AddLocalReference(object_id, call_site);
   }
 
+  /// Register a callback to fire when an object goes out of scope or is freed.
+  ///
+  /// The callback is posted to the dedicated object_freed_callback_service_ thread
+  /// so it never blocks the main IO thread.
+  ///
+  /// \return true if the callback was registered; false if the object is already
+  ///         out of scope or was explicitly freed (callback will never fire).
+  bool AddObjectOutOfScopeOrFreedCallback(
+      const ObjectID &object_id, const std::function<void(const ObjectID &)> &callback);
+
+  /// C function-pointer overload (for Cython); forwards \p user_data to \p callback.
+  bool AddObjectOutOfScopeOrFreedCallback(const ObjectID &object_id,
+                                          void (*callback)(const ObjectID &, void *),
+                                          void *user_data);
+
   /// Stops the children tasks from the given TaskID
   ///
   /// \param[in] task_id of the parent task
@@ -1751,6 +1768,9 @@ class CoreWorker : public std::enable_shared_from_this<CoreWorker> {
   /// Event loop where the IO events are handled. e.g. async GCS operations.
   instrumented_io_context &io_service_;
 
+  /// Dedicated event loop for object-freed callbacks, keeping them off io_service_.
+  instrumented_io_context &object_freed_callback_service_;
+
   /// Shared core worker client pool.
   std::shared_ptr<rpc::CoreWorkerClientPool> core_worker_client_pool_;
 
@@ -1777,6 +1797,9 @@ class CoreWorker : public std::enable_shared_from_this<CoreWorker> {
   // Thread that runs a boost::asio service to process IO events.
   boost::thread &io_thread_;
 
+  /// Dedicated thread for user-registered object-freed callbacks.
+  boost::thread &object_freed_callback_thread_;
+
   // Keeps track of object ID reference counts.
   std::shared_ptr<ReferenceCounterInterface> reference_counter_;
 
diff --git a/src/ray/core_worker/core_worker_process.cc b/src/ray/core_worker/core_worker_process.cc
index 88f0001fea42..08613ce3ecbb 100644
--- a/src/ray/core_worker/core_worker_process.cc
+++ b/src/ray/core_worker/core_worker_process.cc
@@ -189,6 +189,26 @@ std::shared_ptr<CoreWorker> CoreWorkerProcessImpl::CreateCoreWorker(
     RAY_LOG(INFO) << "Core worker main io service stopped.";
   });
 
+  // Start the dedicated callback thread for user-registered object-freed callbacks.
+  // Python code invoked from callbacks may call into numpy or other libraries that
+  // need a large stack; give it the same 16 MB as the IO thread on Mac.
+  boost::thread::attributes obj_freed_cb_thread_attrs;
+#if defined(__APPLE__)
+  obj_freed_cb_thread_attrs.set_stack_size(16777216);
+#endif
+  object_freed_callback_thread_ = boost::thread(obj_freed_cb_thread_attrs, [this]() {
+#ifndef _WIN32
+    sigset_t mask;
+    sigemptyset(&mask);
+    sigaddset(&mask, SIGINT);
+    sigaddset(&mask, SIGTERM);
+    pthread_sigmask(SIG_BLOCK, &mask, nullptr);
+#endif
+    SetThreadName("worker.obj_freed_cb");
+    object_freed_callback_service_.run();
+    RAY_LOG(INFO) << "Object-freed callback service stopped.";
+  });
+
   if (options.worker_type == WorkerType::DRIVER &&
       !options.serialized_job_config.empty()) {
     // Driver populates the job config via initialization.
@@ -682,6 +702,7 @@ std::shared_ptr<CoreWorker> CoreWorkerProcessImpl::CreateCoreWorker(
       std::make_shared<CoreWorker>(std::move(options),
                                    std::move(worker_context),
                                    io_service_,
+                                   object_freed_callback_service_,
                                    std::move(core_worker_client_pool),
                                    std::move(raylet_client_pool),
                                    std::move(periodical_runner),
@@ -691,6 +712,7 @@ std::shared_ptr<CoreWorker> CoreWorkerProcessImpl::CreateCoreWorker(
                                    std::move(raylet_ipc_client),
                                    std::move(local_raylet_rpc_client),
                                    io_thread_,
+                                   object_freed_callback_thread_,
                                    std::move(reference_counter),
                                    std::move(memory_store),
                                    std::move(plasma_store_provider),
@@ -722,6 +744,7 @@ CoreWorkerProcessImpl::CoreWorkerProcessImpl(const CoreWorkerOptions &options)
                      ? ComputeDriverIdFromJob(options_.job_id)
                      : options_.worker_id),
       io_work_(io_service_.get_executor()),
+      object_freed_callback_service_work_(object_freed_callback_service_.get_executor()),
       client_call_manager_(std::make_unique<rpc::ClientCallManager>(
           io_service_, /*record_stats=*/false, options.node_ip_address)),
       task_execution_service_work_(task_execution_service_.get_executor()),
diff --git a/src/ray/core_worker/core_worker_process.h b/src/ray/core_worker/core_worker_process.h
index 85398b27baee..94d0446299e0 100644
--- a/src/ray/core_worker/core_worker_process.h
+++ b/src/ray/core_worker/core_worker_process.h
@@ -162,6 +162,15 @@ class CoreWorkerProcessImpl {
   /// Keeps the io_service_ alive.
   boost::asio::executor_work_guard<boost::asio::io_context::executor_type> io_work_;
 
+  /// Dedicated io_context for out-of-scope callbacks registered by users (e.g. Python).
+  instrumented_io_context object_freed_callback_service_{
+      /*enable_lag_probe=*/false,
+      /*running_on_single_thread=*/true};
+
+  /// Keeps object_freed_callback_service_ alive until explicitly stopped.
+  boost::asio::executor_work_guard<boost::asio::io_context::executor_type>
+      object_freed_callback_service_work_;
+
   /// Shared client call manager across all gRPC clients in the core worker process.
   /// This is used by the CoreWorker and the MetricsAgentClient.
   std::unique_ptr<rpc::ClientCallManager> client_call_manager_;
@@ -179,6 +188,9 @@ class CoreWorkerProcessImpl {
   // Thread that runs a boost::asio service to process IO events.
   boost::thread io_thread_;
 
+  /// Thread that drains object_freed_callback_service_.
+  boost::thread object_freed_callback_thread_;
+
   /// The core worker instance of this worker process.
   MutexProtected<std::shared_ptr<CoreWorker>> core_worker_;
 
diff --git a/src/ray/core_worker/core_worker_shutdown_executor.cc b/src/ray/core_worker/core_worker_shutdown_executor.cc
index d988d061c894..e4966fdbeef7 100644
--- a/src/ray/core_worker/core_worker_shutdown_executor.cc
+++ b/src/ray/core_worker/core_worker_shutdown_executor.cc
@@ -94,6 +94,12 @@ void CoreWorkerShutdownExecutor::ExecuteGracefulShutdown(
     }
   }
 
+  core_worker->object_freed_callback_service_.stop();
+  RAY_LOG(INFO) << "Waiting for joining the object-freed callback thread.";
+  if (core_worker->object_freed_callback_thread_.joinable()) {
+    core_worker->object_freed_callback_thread_.join();
+  }
+
   core_worker->core_worker_server_->Shutdown();
 
   // GCS client is safe to disconnect now that io_service has stopped.
diff --git a/src/ray/core_worker/tests/core_worker_test.cc b/src/ray/core_worker/tests/core_worker_test.cc
index db8297d2a1f5..9fa24659347d 100644
--- a/src/ray/core_worker/tests/core_worker_test.cc
+++ b/src/ray/core_worker/tests/core_worker_test.cc
@@ -65,6 +65,8 @@ class CoreWorkerTest : public ::testing::Test {
   CoreWorkerTest()
       : io_work_(io_service_.get_executor()),
         task_execution_service_work_(task_execution_service_.get_executor()),
+        object_freed_callback_service_work_(
+            object_freed_callback_service_.get_executor()),
         current_time_ms_(0.0) {
     CoreWorkerOptions options;
     options.worker_type = WorkerType::WORKER;
@@ -258,6 +260,7 @@ class CoreWorkerTest : public ::testing::Test {
     core_worker_ = std::make_shared<CoreWorker>(std::move(options),
                                                 std::move(worker_context),
                                                 io_service_,
+                                                object_freed_callback_service_,
                                                 std::move(core_worker_client_pool),
                                                 std::move(raylet_client_pool),
                                                 std::move(periodical_runner),
@@ -267,6 +270,7 @@ class CoreWorkerTest : public ::testing::Test {
                                                 std::move(fake_raylet_ipc_client),
                                                 std::move(fake_local_raylet_rpc_client),
                                                 io_thread_,
+                                                object_freed_callback_thread_,
                                                 reference_counter_,
                                                 memory_store_,
                                                 nullptr,  // plasma_store_provider_
@@ -291,11 +295,19 @@ class CoreWorkerTest : public ::testing::Test {
  protected:
   instrumented_io_context io_service_;
   instrumented_io_context task_execution_service_;
+  instrumented_io_context object_freed_callback_service_;
   boost::asio::executor_work_guard<boost::asio::io_context::executor_type> io_work_;
   boost::asio::executor_work_guard<boost::asio::io_context::executor_type>
       task_execution_service_work_;
+  boost::asio::executor_work_guard<boost::asio::io_context::executor_type>
+      object_freed_callback_service_work_;
 
   boost::thread io_thread_;
+  boost::thread object_freed_callback_thread_;
+
+  /// Flush all pending object-freed callbacks. Call this in tests after an action
+  /// that should trigger a user-registered out-of-scope callback.
+  void FlushObjectFreedCallbacks() { object_freed_callback_service_.poll(); }
 
   rpc::Address rpc_address_;
   std::unique_ptr<rpc::ClientCallManager> client_call_manager_;
@@ -1310,5 +1322,138 @@ INSTANTIATE_TEST_SUITE_P(ActorRefDeletedForRegisteringActor,
                          HandleWaitForActorRefDeletedWhileRegisteringRetriesTest,
                          ::testing::Values(true, false));
 
+// Callback fires after the last local reference is dropped, and
+// FlushObjectFreedCallbacks drains the pending work.
+TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_FiresAfterRefDrop) {
+  auto object_id = ObjectID::FromRandom();
+  rpc::Address owner_address;
+  owner_address.set_worker_id(core_worker_->GetWorkerID().Binary());
+  reference_counter_->AddOwnedObject(object_id,
+                                     {},
+                                     owner_address,
+                                     "",
+                                     0,
+                                     LineageReconstructionEligibility::INELIGIBLE_PUT,
+                                     /*add_local_ref=*/true);
+
+  bool fired = false;
+  ObjectID received_id;
+  bool registered = core_worker_->AddObjectOutOfScopeOrFreedCallback(
+      object_id, [&fired, &received_id](const ObjectID &id) {
+        fired = true;
+        received_id = id;
+      });
+  ASSERT_TRUE(registered);
+  ASSERT_FALSE(fired);
+
+  reference_counter_->RemoveLocalReference(object_id, nullptr);
+  // Callback is posted to the dedicated service; flush it synchronously.
+  FlushObjectFreedCallbacks();
+  ASSERT_TRUE(fired);
+  EXPECT_EQ(received_id, object_id);
+}
+
+// Returns false when the object is already out of scope; callback never fires.
+TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_ReturnsFalseWhenAlreadyOutOfScope) {
+  auto object_id = ObjectID::FromRandom();
+  rpc::Address owner_address;
+  owner_address.set_worker_id(core_worker_->GetWorkerID().Binary());
+  // Add and immediately remove the reference so it goes out of scope.
+  reference_counter_->AddOwnedObject(object_id,
+                                     {},
+                                     owner_address,
+                                     "",
+                                     0,
+                                     LineageReconstructionEligibility::INELIGIBLE_PUT,
+                                     /*add_local_ref=*/true);
+  reference_counter_->RemoveLocalReference(object_id, nullptr);
+
+  bool fired = false;
+  bool registered = core_worker_->AddObjectOutOfScopeOrFreedCallback(
+      object_id, [&fired](const ObjectID &) { fired = true; });
+  ASSERT_FALSE(registered);
+  FlushObjectFreedCallbacks();
+  ASSERT_FALSE(fired);
+}
+
+// The callback must run on the dedicated object_freed_callback_service_ thread,
+// not on the IO thread or the test thread.
+TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_RunsOnDedicatedThread) {
+  auto object_id = ObjectID::FromRandom();
+  rpc::Address owner_address;
+  owner_address.set_worker_id(core_worker_->GetWorkerID().Binary());
+  reference_counter_->AddOwnedObject(object_id,
+                                     {},
+                                     owner_address,
+                                     "",
+                                     0,
+                                     LineageReconstructionEligibility::INELIGIBLE_PUT,
+                                     /*add_local_ref=*/true);
+
+  std::promise<boost::thread::id> thread_id_promise;
+  bool registered = core_worker_->AddObjectOutOfScopeOrFreedCallback(
+      object_id, [&thread_id_promise](const ObjectID &) {
+        thread_id_promise.set_value(boost::this_thread::get_id());
+      });
+  ASSERT_TRUE(registered);
+
+  // RemoveLocalReference fires OnObjectOutOfScopeOrFreed inline (test thread), which
+  // calls the wrapped lambda that posts the real callback to
+  // object_freed_callback_service_.
+  reference_counter_->RemoveLocalReference(object_id, nullptr);
+
+  // Start the dedicated thread so the posted work can run.
+  object_freed_callback_thread_ =
+      boost::thread([this]() { object_freed_callback_service_.run(); });
+  // Record the id now: boost::thread::get_id() is not-a-thread once joined.
+  const auto dedicated_tid = object_freed_callback_thread_.get_id();
+
+  auto tid = thread_id_promise.get_future().get();
+  object_freed_callback_service_.stop();
+  if (object_freed_callback_thread_.joinable()) {
+    object_freed_callback_thread_.join();
+  }
+  EXPECT_EQ(tid, dedicated_tid)
+      << "Callback must run on object_freed_callback_thread_";
+  EXPECT_NE(tid, boost::this_thread::get_id())
+      << "Callback must not run on the test thread";
+}
+
+// The C function-pointer overload (used by Cython) routes through the same
+// dedicated thread and delivers the correct object_id + user_data.
+TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_CFunctionPointerOverload) {
+  auto object_id = ObjectID::FromRandom();
+  rpc::Address owner_address;
+  owner_address.set_worker_id(core_worker_->GetWorkerID().Binary());
+  reference_counter_->AddOwnedObject(object_id,
+                                     {},
+                                     owner_address,
+                                     "",
+                                     0,
+                                     LineageReconstructionEligibility::INELIGIBLE_PUT,
+                                     /*add_local_ref=*/true);
+
+  struct Result {
+    ObjectID id;
+    bool fired = false;
+  } result;
+
+  auto c_callback = [](const ObjectID &id, void *data) {
+    auto *r = static_cast<Result *>(data);
+    r->id = id;
+    r->fired = true;
+  };
+
+  bool registered =
+      core_worker_->AddObjectOutOfScopeOrFreedCallback(object_id, c_callback, &result);
+  ASSERT_TRUE(registered);
+
+  reference_counter_->RemoveLocalReference(object_id, nullptr);
+  FlushObjectFreedCallbacks();
+
+  ASSERT_TRUE(result.fired);
+  EXPECT_EQ(result.id, object_id);
+}
+
 }  // namespace core
 }  // namespace ray

From 5c3780a2ca6c55ca176b51f2daaccb72af64ff85 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 10 Jun 2026 16:27:13 -0700
Subject: [PATCH 02/53] Expose object out-of-scope callbacks through the Cython layer

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/_raylet.pyx                        |  35 ++++
 python/ray/includes/libcoreworker.pxd         |   4 +
 .../test_object_out_of_scope_callback.py      | 183 ++++++++++++++++++
 3 files changed, 222 insertions(+)
 create mode 100644 python/ray/tests/test_object_out_of_scope_callback.py

diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx
index 0933767614ad..dbea9a8da3de 100644
--- a/python/ray/_raylet.pyx
+++ b/python/ray/_raylet.pyx
@@ -2761,6 +2761,22 @@ cdef class GcsClient:
                 ray._private.utils._CALLED_FREQ[name] += 1
         return getattr(self.inner, name)
 
+# Invoked by the dedicated object_freed_callback_service_ thread when an object
+# goes out of scope or is freed. Acquires the GIL, calls the Python callable stored
+# as user_data, then drops the reference taken in add_object_out_of_scope_callback.
+cdef void _invoke_object_out_of_scope_callback(
+        const CObjectID &c_object_id, void *user_data) noexcept nogil:
+    with gil:
+        object_ref_id = ObjectRef(c_object_id.Binary())
+        try:
+            (<object>user_data)(object_ref_id)
+        except Exception:
+            # Exceptions in these callbacks cannot propagate to the C++ caller.
+            pass
+        finally:
+            cpython.Py_DECREF(<object>user_data)
+
+
 cdef class CoreWorker:
 
     def __cinit__(self, worker_type, store_socket, raylet_socket,
@@ -4102,6 +4118,25 @@ cdef class CoreWorker:
             CCoreWorkerProcess.GetCoreWorker().RemoveLocalReference(
                 c_object_id)
 
+    def add_object_out_of_scope_callback(self, ObjectRef object_ref, callback):
+        """Register a Python callable to fire when object_ref goes out of scope.
+
+        Returns True if registered; False if the object is already out of scope
+        (the callback will never fire and should be discarded).
+        """
+        cdef CObjectID c_object_id = object_ref.native()
+        # Keep the callable alive until the C++ callback fires (or never fires).
+        cpython.Py_INCREF(callback)
+        registered = CCoreWorkerProcess.GetCoreWorker() \
+            .AddObjectOutOfScopeOrFreedCallback(
+                c_object_id,
+                _invoke_object_out_of_scope_callback,
+                <void *>callback)
+        if not registered:
+            # Callback will never fire; balance the Py_INCREF we just took.
+            cpython.Py_DECREF(callback)
+        return registered
+
     def get_owner_address(self, ObjectRef object_ref):
         cdef:
             CObjectID c_object_id = object_ref.native()
diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd
index 60a99d85f85a..796faf1496c4 100644
--- a/python/ray/includes/libcoreworker.pxd
+++ b/python/ray/includes/libcoreworker.pxd
@@ -235,6 +235,10 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
             c_bool all_namespaces)
         void AddLocalReference(const CObjectID &object_id)
         void RemoveLocalReference(const CObjectID &object_id)
+        c_bool AddObjectOutOfScopeOrFreedCallback(
+            const CObjectID &object_id,
+            void (*callback)(const CObjectID &, void *) nogil,
+            void *user_data)
         void PutObjectIntoPlasma(const CRayObject &object,
                                  const CObjectID &object_id)
         const CAddress &GetRpcAddress() const
diff --git a/python/ray/tests/test_object_out_of_scope_callback.py b/python/ray/tests/test_object_out_of_scope_callback.py
new file mode 100644
index 000000000000..7555c070479b
--- /dev/null
+++ b/python/ray/tests/test_object_out_of_scope_callback.py
@@ -0,0 +1,183 @@
+# These tests require a live Ray cluster because the callback path goes through
+# the C++ ReferenceCounter and the dedicated object_freed_callback_service_ thread.
+
+import threading
+import time
+
+import pytest
+
+import ray
+import ray._private.worker as worker_module
+
+
+@pytest.fixture(scope="module")
+def ray_instance():
+    ray.init(num_cpus=1)
+    yield
+    ray.shutdown()
+
+
+def _core_worker():
+    return worker_module.global_worker.core_worker
+
+
+class TestAddObjectOutOfScopeCallback:
+    """Integration tests against a live Ray cluster."""
+
+    def test_callback_fires_when_ref_dropped(self, ray_instance):
+        """Callback fires with the correct object ID once the last reference is dropped."""
+        received_id = []
+        done = threading.Event()
+
+        ref = ray.put(42)
+        # Keep only the raw ID: aliasing `ref` would pin the object in scope.
+        expected_id = ref.binary()
+        registered = _core_worker().add_object_out_of_scope_callback(
+            ref, lambda r: (received_id.append(r.binary()), done.set())
+        )
+        assert registered, "Expected registration to succeed"
+        del ref
+        assert done.wait(timeout=5), "Callback did not fire within 5 s"
+        assert received_id[0] == expected_id
+
+    def test_returns_false_for_already_out_of_scope(self, ray_instance):
+        """Returns False when the object is already out of scope; callback never fires."""
+        ref = ray.put(99)
+        # Force the RC entry gone while keeping the Python ObjectRef alive.
+        ray.internal.free([ref])
+
+        # Poll with a no-op until the RC entry is gone (ray.internal.free is async).
+        deadline = time.monotonic() + 5
+        registered = True
+        while time.monotonic() < deadline:
+            registered = _core_worker().add_object_out_of_scope_callback(
+                ref, lambda _: None
+            )
+            if not registered:
+                break
+            time.sleep(0.01)
+        assert not registered, "Object never became out of scope within 5 s"
+
+        # Now that we know the object is OOS, attempt registration with a real
+        # callback — must return False — then use a sentinel to drain the callback
+        # thread before asserting fired is still unset.
+        fired = threading.Event()
+        _core_worker().add_object_out_of_scope_callback(ref, lambda _: fired.set())
+
+        sentinel_done = threading.Event()
+        sentinel = ray.put("sentinel")
+        _core_worker().add_object_out_of_scope_callback(
+            sentinel, lambda _: sentinel_done.set()
+        )
+        del sentinel
+        assert sentinel_done.wait(timeout=5)
+        assert not fired.is_set(), "Callback must not fire for an already-freed object"
+
+    def test_callback_runs_on_non_driver_thread(self, ray_instance):
+        """Callback must run on the dedicated callback thread, not the driver thread."""
+        driver_thread_id = threading.get_ident()
+        callback_thread_ids = []
+        done = threading.Event()
+
+        ref = ray.put("thread_check")
+
+        def cb(_):
+            callback_thread_ids.append(threading.get_ident())
+            done.set()
+
+        _core_worker().add_object_out_of_scope_callback(ref, cb)
+        del ref
+        assert done.wait(timeout=5)
+        assert (
+            callback_thread_ids[0] != driver_thread_id
+        ), "Callback must not run on the driver thread"
+
+    def test_multiple_callbacks_all_fire(self, ray_instance):
+        """Multiple independent callbacks on the same object all fire."""
+        n = 3
+        counter = [0]
+        lock = threading.Lock()
+        done = threading.Event()
+
+        ref = ray.put("multi")
+
+        def make_cb():
+            def cb(_):
+                with lock:
+                    counter[0] += 1
+                    if counter[0] == n:
+                        done.set()
+
+            return cb
+
+        for _ in range(n):
+            _core_worker().add_object_out_of_scope_callback(ref, make_cb())
+
+        del ref
+        assert done.wait(timeout=5), f"Only {counter[0]}/{n} callbacks fired"
+
+    def test_ray_internal_free_triggers_callback(self, ray_instance):
+        """`ray.internal.free` should trigger the callback."""
+        fired = threading.Event()
+
+        ref = ray.put("free_me")
+        registered = _core_worker().add_object_out_of_scope_callback(
+            ref, lambda _: fired.set()
+        )
+        assert registered
+
+        ray.internal.free([ref])
+        assert fired.wait(timeout=5), "Callback did not fire after ray.internal.free"
+
+    def test_callback_exception_does_not_crash(self, ray_instance):
+        """A Python exception inside the callback must not propagate to C++."""
+        second_fired = threading.Event()
+
+        ref = ray.put("exc_test")
+
+        def bad_cb(_):
+            raise RuntimeError("intentional error in callback")
+
+        _core_worker().add_object_out_of_scope_callback(ref, bad_cb)
+        _core_worker().add_object_out_of_scope_callback(
+            ref, lambda _: second_fired.set()
+        )
+
+        del ref
+        assert second_fired.wait(
+            timeout=5
+        ), "Exception in one callback must not prevent subsequent callbacks"
+
+    def test_callback_fires_exactly_once(self, ray_instance):
+        """The callback fires exactly once."""
+        count = [0]
+        lock = threading.Lock()
+        done = threading.Event()
+
+        ref = ray.put("once")
+
+        def cb(_):
+            with lock:
+                count[0] += 1
+            done.set()
+
+        _core_worker().add_object_out_of_scope_callback(ref, cb)
+        del ref
+        assert done.wait(timeout=5)
+
+        # Drain the callback thread via a sentinel: any spurious second fire would
+        # arrive before the sentinel, so if count is still 1 after this, we're clean.
+        sentinel_done = threading.Event()
+        sentinel = ray.put("sentinel")
+        _core_worker().add_object_out_of_scope_callback(
+            sentinel, lambda _: sentinel_done.set()
+        )
+        del sentinel
+        assert sentinel_done.wait(timeout=5)
+        assert count[0] == 1, f"Expected exactly 1 callback, got {count[0]}"
+
+
+if __name__ == "__main__":
+    import sys
+
+    sys.exit(pytest.main([__file__, "-v"]))

From eb668f86f5cb650ff246144860b08a69e51bd480 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 10 Jun 2026 17:48:49 -0700
Subject: [PATCH 03/53] Address reviews

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/_raylet.pyx                                | 11 +++++++----
 python/ray/tests/test_object_out_of_scope_callback.py |  5 ++++-
 src/ray/core_worker/core_worker.cc                    |  9 +++++----
 src/ray/core_worker/core_worker_shutdown_executor.cc  |  6 +++++-
 src/ray/core_worker/tests/core_worker_test.cc         |  4 ++--
 5 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx
index dbea9a8da3de..7419835518e7 100644
--- a/python/ray/_raylet.pyx
+++ b/python/ray/_raylet.pyx
@@ -2767,12 +2767,11 @@ cdef class GcsClient:
 cdef void _invoke_object_out_of_scope_callback(
         const CObjectID &c_object_id, void *user_data) noexcept nogil:
     with gil:
-        object_ref_id = ObjectRef(c_object_id.Binary())
         try:
+            object_ref_id = ObjectRef(c_object_id.Binary(), weak_ref=True)
             (<object>user_data)(object_ref_id)
-        except Exception:
-            # Exceptions in these callbacks cannot propagate to the C++ caller.
-            pass
+        except BaseException:
+            logger.exception("Error in object out-of-scope callback")
         finally:
             cpython.Py_DECREF(<object>user_data)
 
@@ -4124,6 +4123,10 @@ cdef class CoreWorker:
         Returns True if registered; False if the object is already out of scope
         (the callback will never fire and should be discarded).
         """
+        if not callable(callback):
+            raise TypeError(
+                f"callback must be callable, got {type(callback).__name__!r}"
+            )
         cdef CObjectID c_object_id = object_ref.native()
         # Keep the callable alive until the C++ callback fires (or never fires).
         cpython.Py_INCREF(callback)
diff --git a/python/ray/tests/test_object_out_of_scope_callback.py b/python/ray/tests/test_object_out_of_scope_callback.py
index 7555c070479b..07ee076db2ec 100644
--- a/python/ray/tests/test_object_out_of_scope_callback.py
+++ b/python/ray/tests/test_object_out_of_scope_callback.py
@@ -62,7 +62,10 @@ def test_returns_false_for_already_out_of_scope(self, ray_instance):
         # callback — must return False — then use a sentinel to drain the callback
         # thread before asserting fired is still unset.
         fired = threading.Event()
-        _core_worker().add_object_out_of_scope_callback(ref, lambda _: fired.set())
+        registered2 = _core_worker().add_object_out_of_scope_callback(
+            ref, lambda _: fired.set()
+        )
+        assert not registered2, "Second registration on OOS object must return False"
 
         sentinel_done = threading.Event()
         sentinel = ray.put("sentinel")
diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc
index a3a30ba6e788..c67bc31c071b 100644
--- a/src/ray/core_worker/core_worker.cc
+++ b/src/ray/core_worker/core_worker.cc
@@ -2483,10 +2483,10 @@ bool CoreWorker::AddObjectOutOfScopeOrFreedCallback(
     const ObjectID &object_id, const std::function<void(const ObjectID &)> &callback) {
   // Wrap so the actual callback runs on the dedicated thread.
   // The wrapper itself is quick (just a post) and safe to call under the
-  // ReferenceCounter mutex.
-  auto wrapped = [this, callback](const ObjectID &id) {
-    object_freed_callback_service_.post([callback, id]() { callback(id); },
-                                        "CoreWorker.ObjFreedCb");
+  // ReferenceCounter mutex. Capture the service by reference (not `this`) so
+  // the lambda remains safe if CoreWorker is destroyed before the RC calls it.
+  auto wrapped = [&svc = object_freed_callback_service_, callback](const ObjectID &id) {
+    svc.post([callback, id]() { callback(id); }, "CoreWorker.ObjFreedCb");
   };
   return reference_counter_->AddObjectOutOfScopeOrFreedCallback(object_id, wrapped);
 }
@@ -2495,6 +2495,7 @@ bool CoreWorker::AddObjectOutOfScopeOrFreedCallback(const ObjectID &object_id,
                                                     void (*callback)(const ObjectID &,
                                                                      void *),
                                                     void *user_data) {
+  RAY_CHECK(callback != nullptr) << "callback must not be null";
   return AddObjectOutOfScopeOrFreedCallback(
       object_id, [callback, user_data](const ObjectID &id) { callback(id, user_data); });
 }
diff --git a/src/ray/core_worker/core_worker_shutdown_executor.cc b/src/ray/core_worker/core_worker_shutdown_executor.cc
index e4966fdbeef7..11a0a43fb92e 100644
--- a/src/ray/core_worker/core_worker_shutdown_executor.cc
+++ b/src/ray/core_worker/core_worker_shutdown_executor.cc
@@ -94,7 +94,11 @@ void CoreWorkerShutdownExecutor::ExecuteGracefulShutdown(
     }
   }
 
-  core_worker->object_freed_callback_service_.stop();
+  // Post stop() as a handler so it runs after all pending Py_DECREF callbacks
+  // have executed — avoids leaking Python refcounts if callbacks are in flight.
+  core_worker->object_freed_callback_service_.post(
+      [&svc = core_worker->object_freed_callback_service_]() { svc.stop(); },
+      "CoreWorker.StopCallbackService");
   RAY_LOG(INFO) << "Waiting for joining the object-freed callback thread.";
   if (core_worker->object_freed_callback_thread_.joinable()) {
     core_worker->object_freed_callback_thread_.join();
diff --git a/src/ray/core_worker/tests/core_worker_test.cc b/src/ray/core_worker/tests/core_worker_test.cc
index 9fa24659347d..943ca34e3550 100644
--- a/src/ray/core_worker/tests/core_worker_test.cc
+++ b/src/ray/core_worker/tests/core_worker_test.cc
@@ -1407,14 +1407,14 @@ TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_RunsOnDedicatedThread) {
       boost::thread([this]() { object_freed_callback_service_.run(); });
 
   auto tid = thread_id_promise.get_future().get();
+  auto expected_tid = object_freed_callback_thread_.get_id();
 
   object_freed_callback_service_.stop();
   if (object_freed_callback_thread_.joinable()) {
     object_freed_callback_thread_.join();
   }
 
-  EXPECT_EQ(tid, object_freed_callback_thread_.get_id())
-      << "Callback must run on object_freed_callback_thread_";
+  EXPECT_EQ(tid, expected_tid) << "Callback must run on object_freed_callback_thread_";
   EXPECT_NE(tid, boost::this_thread::get_id())
       << "Callback must not run on the test thread";
 }

From 711d5a3d2ee611d3803a12cd8242e392060d9498 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 10 Jun 2026 17:57:07 -0700
Subject: [PATCH 04/53] Fix ObjectRef parameter

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/_raylet.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx
index 7419835518e7..bd52f648baaa 100644
--- a/python/ray/_raylet.pyx
+++ b/python/ray/_raylet.pyx
@@ -2768,7 +2768,7 @@ cdef void _invoke_object_out_of_scope_callback(
         const CObjectID &c_object_id, void *user_data) noexcept nogil:
     with gil:
         try:
-            object_ref_id = ObjectRef(c_object_id.Binary(), weak_ref=True)
+            object_ref_id = ObjectRef(c_object_id.Binary(), skip_adding_local_ref=True)
             (<object>user_data)(object_ref_id)
         except BaseException:
             logger.exception("Error in object out-of-scope callback")

From 06a657d06091e97d794c0248337554bc2eea15a0 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 10 Jun 2026 18:05:53 -0700
Subject: [PATCH 05/53] Set in_core_worker to avoid unmatched decrement of
 ObjectRef refcount

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/_raylet.pyx | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx
index bd52f648baaa..0a26740d3f4e 100644
--- a/python/ray/_raylet.pyx
+++ b/python/ray/_raylet.pyx
@@ -2769,6 +2769,11 @@ cdef void _invoke_object_out_of_scope_callback(
     with gil:
         try:
             object_ref_id = ObjectRef(c_object_id.Binary(), skip_adding_local_ref=True)
+            # skip_adding_local_ref suppresses add_object_ref_reference, but
+            # __init__ still sets in_core_worker=True so __dealloc__ would call
+            # remove_object_ref_reference without a matching add. Clear it so
+            # the temporary ref is truly no-op on the refcount.
+            object_ref_id.in_core_worker = False
             (<object>user_data)(object_ref_id)
         except BaseException:
             logger.exception("Error in object out-of-scope callback")

From c534154307c0d00d447ea352ae5e72562eb802ba Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 10 Jun 2026 19:02:53 -0700
Subject: [PATCH 06/53] AddObjectOutOfScopeOrFreedCallback made public

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 src/ray/core_worker/core_worker.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h
index de924e3a0f0f..f3e4e69f22f4 100644
--- a/src/ray/core_worker/core_worker.h
+++ b/src/ray/core_worker/core_worker.h
@@ -401,6 +401,21 @@ class CoreWorker : public std::enable_shared_from_this<CoreWorker> {
     memory_store_->Delete(deleted);
   }
 
+  /// Register a callback to fire when an object goes out of scope or is freed.
+  ///
+  /// The callback is posted to the dedicated object_freed_callback_service_ thread
+  /// so it never blocks the main IO thread.
+  ///
+  /// \return true if the callback was registered; false if the object is already
+  ///         out of scope or was explicitly freed (callback will never fire).
+  bool AddObjectOutOfScopeOrFreedCallback(
+      const ObjectID &object_id, const std::function<void(const ObjectID &)> &callback);
+
+  /// C function-pointer overload for use from Cython.
+  bool AddObjectOutOfScopeOrFreedCallback(const ObjectID &object_id,
+                                          void (*callback)(const ObjectID &, void *),
+                                          void *user_data);
+
   int GetMemoryStoreSize() { return memory_store_->Size(); }
 
   /// Returns a map of all ObjectIDs currently in scope with a pair of their
@@ -1484,21 +1499,6 @@ class CoreWorker : public std::enable_shared_from_this<CoreWorker> {
     reference_counter_->AddLocalReference(object_id, call_site);
   }
 
-  /// Register a callback to fire when an object goes out of scope or is freed.
-  ///
-  /// The callback is posted to the dedicated object_freed_callback_service_ thread
-  /// so it never blocks the main IO thread.
-  ///
-  /// \return true if the callback was registered; false if the object is already
-  ///         out of scope or was explicitly freed (callback will never fire).
-  bool AddObjectOutOfScopeOrFreedCallback(
-      const ObjectID &object_id, const std::function<void(const ObjectID &)> &callback);
-
-  /// C function-pointer overload for use from Cython.
-  bool AddObjectOutOfScopeOrFreedCallback(const ObjectID &object_id,
-                                          void (*callback)(const ObjectID &, void *),
-                                          void *user_data);
-
   /// Stops the children tasks from the given TaskID
   ///
   /// \param[in] task_id of the parent task

From 5e624fe09b474f10bea7677eb32b5982e6e4ab73 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Thu, 11 Jun 2026 12:58:24 -0700
Subject: [PATCH 07/53] Add test to bazel file

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/tests/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/ray/tests/BUILD.bazel b/python/ray/tests/BUILD.bazel
index 4687916378f9..4005f317239b 100644
--- a/python/ray/tests/BUILD.bazel
+++ b/python/ray/tests/BUILD.bazel
@@ -560,6 +560,7 @@ py_test_module_list(
         "test_multinode_failures_2.py",
         "test_node_death.py",
         "test_numba.py",
+        "test_object_out_of_scope_callback.py",
         "test_object_spilling_no_asan.py",
         "test_open_telemetry_metric_recorder.py",
         "test_placement_group_metrics.py",

From 8f422f7a7137c773a8002928bbe7e06807eeba83 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Thu, 11 Jun 2026 15:30:58 -0700
Subject: [PATCH 08/53] Fix test

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/tests/test_object_out_of_scope_callback.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/ray/tests/test_object_out_of_scope_callback.py b/python/ray/tests/test_object_out_of_scope_callback.py
index 07ee076db2ec..2e7f34787570 100644
--- a/python/ray/tests/test_object_out_of_scope_callback.py
+++ b/python/ray/tests/test_object_out_of_scope_callback.py
@@ -30,15 +30,15 @@ def test_callback_fires_when_ref_dropped(self, ray_instance):
         done = threading.Event()
 
         ref = ray.put(42)
-        expected_id = ref
+        expected_binary = ref.binary()
         registered = _core_worker().add_object_out_of_scope_callback(
             ref, lambda r: (received_id.append(r), done.set())
         )
         assert registered, "Expected registration to succeed"
 
-        del ref
+        del ref  # drops the last Python reference — callback must now fire
         assert done.wait(timeout=5), "Callback did not fire within 5 s"
-        assert received_id[0] == expected_id
+        assert received_id[0].binary() == expected_binary
 
     def test_returns_false_for_already_out_of_scope(self, ray_instance):
         """Returns False when the object is already out of scope; callback never fires."""

From 33ed99148a7a0254f0a364ad217335b4cce92148 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Thu, 11 Jun 2026 17:08:51 -0700
Subject: [PATCH 09/53] Address Shutdown edge case

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/_raylet.pyx                        | 27 +++++----
 python/ray/includes/libcoreworker.pxd         |  3 +-
 src/ray/core_worker/core_worker.cc            | 10 +++-
 src/ray/core_worker/core_worker.h             | 14 ++++-
 src/ray/core_worker/tests/core_worker_test.cc | 57 +++++++++++++++++++
 5 files changed, 96 insertions(+), 15 deletions(-)

diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx
index 0a26740d3f4e..467361502c7a 100644
--- a/python/ray/_raylet.pyx
+++ b/python/ray/_raylet.pyx
@@ -2762,8 +2762,9 @@ cdef class GcsClient:
         return getattr(self.inner, name)
 
 # Invoked by the dedicated object_freed_callback_service_ thread when an object
-# goes out of scope. Acquires the GIL, calls the Python callable stored as
-# user_data, then decrements the refcount taken in add_object_out_of_scope_callback.
+# goes out of scope. Acquires the GIL and calls the Python callable stored as
+# user_data. The matching Py_DECREF is handled by _drop_user_data when the C++
+# lambda is destroyed.
 cdef void _invoke_object_out_of_scope_callback(
         const CObjectID &c_object_id, void *user_data) noexcept nogil:
     with gil:
@@ -2777,8 +2778,14 @@ cdef void _invoke_object_out_of_scope_callback(
             (<object>user_data)(object_ref_id)
         except BaseException:
             logger.exception("Error in object out-of-scope callback")
-        finally:
-            cpython.Py_DECREF(<object>user_data)
+
+
+# Called by C++ when the out-of-scope lambda is destroyed, on both the normal
+# (callback fired) and the shutdown (lambda dropped) paths. Balances the
+# Py_INCREF taken in add_object_out_of_scope_callback.
+cdef void _drop_user_data(void *user_data) noexcept nogil:
+    with gil:
+        cpython.Py_DECREF(<object>user_data)
 
 
 cdef class CoreWorker:
@@ -4133,17 +4140,15 @@ cdef class CoreWorker:
                 f"callback must be callable, got {type(callback).__name__!r}"
             )
         cdef CObjectID c_object_id = object_ref.native()
-        # Keep the callable alive until the C++ callback fires (or never fires).
+        # Keep the callable alive; _drop_user_data balances this Py_INCREF when
+        # the C++ lambda is destroyed (fired or dropped at shutdown).
         cpython.Py_INCREF(callback)
-        registered = CCoreWorkerProcess.GetCoreWorker() \
+        return CCoreWorkerProcess.GetCoreWorker() \
             .AddObjectOutOfScopeOrFreedCallback(
                 c_object_id,
                 _invoke_object_out_of_scope_callback,
-                <void *>callback)
-        if not registered:
-            # Callback will never fire; balance the Py_INCREF we just took.
-            cpython.Py_DECREF(callback)
-        return registered
+                <void *>callback,
+                _drop_user_data)
 
     def get_owner_address(self, ObjectRef object_ref):
         cdef:
diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd
index 796faf1496c4..f39abdc65d3f 100644
--- a/python/ray/includes/libcoreworker.pxd
+++ b/python/ray/includes/libcoreworker.pxd
@@ -238,7 +238,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
         c_bool AddObjectOutOfScopeOrFreedCallback(
             const CObjectID &object_id,
             void (*callback)(const CObjectID &, void *) nogil,
-            void *user_data)
+            void *user_data,
+            void (*on_drop)(void *) nogil)
         void PutObjectIntoPlasma(const CRayObject &object,
                                  const CObjectID &object_id)
         const CAddress &GetRpcAddress() const
diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc
index c67bc31c071b..019c69ad444a 100644
--- a/src/ray/core_worker/core_worker.cc
+++ b/src/ray/core_worker/core_worker.cc
@@ -2494,10 +2494,16 @@ bool CoreWorker::AddObjectOutOfScopeOrFreedCallback(
 bool CoreWorker::AddObjectOutOfScopeOrFreedCallback(const ObjectID &object_id,
                                                     void (*callback)(const ObjectID &,
                                                                      void *),
-                                                    void *user_data) {
+                                                    void *user_data,
+                                                    void (*on_drop)(void *)) {
   RAY_CHECK(callback != nullptr) << "callback must not be null";
+  // Wrap user_data in a shared_ptr so on_drop is called when the lambda is
+  // destroyed — whether the callback fired normally or was dropped at shutdown.
+  auto owned = std::shared_ptr<void>(user_data, [on_drop](void *p) {
+    if (on_drop) on_drop(p);
+  });
   return AddObjectOutOfScopeOrFreedCallback(
-      object_id, [callback, user_data](const ObjectID &id) { callback(id, user_data); });
+      object_id, [callback, owned](const ObjectID &id) { callback(id, owned.get()); });
 }
 
 Status CoreWorker::CancelChildren(const TaskID &task_id, bool force_kill) {
diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h
index f3e4e69f22f4..6720f9fdf09c 100644
--- a/src/ray/core_worker/core_worker.h
+++ b/src/ray/core_worker/core_worker.h
@@ -412,9 +412,21 @@ class CoreWorker : public std::enable_shared_from_this<CoreWorker> {
       const ObjectID &object_id, const std::function<void(const ObjectID &)> &callback);
 
   /// C function-pointer overload for use from Cython.
+  ///
+  /// \param object_id The object to watch.
+  /// \param callback Invoked with (object_id, user_data) when the object goes
+  ///        out of scope.
+  /// \param user_data Passed as the second argument to callback.
+  /// \param on_drop If non-null, called with user_data when the internal lambda
+  ///        is destroyed. Necessary because destroying a raw void* is a no-op,
+  ///        so on_drop is the only way to release resources (e.g. Py_DECREF)
+  ///        on both the normal and shutdown (RC teardown) paths.
+  /// \return true if registered; false if the object is already out of scope
+  ///         (callback will never fire).
   bool AddObjectOutOfScopeOrFreedCallback(const ObjectID &object_id,
                                           void (*callback)(const ObjectID &, void *),
-                                          void *user_data);
+                                          void *user_data,
+                                          void (*on_drop)(void *) = nullptr);
 
   int GetMemoryStoreSize() { return memory_store_->Size(); }
 
diff --git a/src/ray/core_worker/tests/core_worker_test.cc b/src/ray/core_worker/tests/core_worker_test.cc
index 943ca34e3550..2f2f580b38c9 100644
--- a/src/ray/core_worker/tests/core_worker_test.cc
+++ b/src/ray/core_worker/tests/core_worker_test.cc
@@ -1455,5 +1455,62 @@ TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_CFunctionPointerOverload) {
   EXPECT_EQ(result.id, object_id);
 }
 
+TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_OnDropFiresWithCallback) {
+  // on_drop must fire on the normal path (after the callback is invoked and
+  // the internal lambda is destroyed).
+  auto object_id = ObjectID::FromRandom();
+  rpc::Address owner_address;
+  owner_address.set_worker_id(core_worker_->GetWorkerID().Binary());
+  reference_counter_->AddOwnedObject(object_id,
+                                     {},
+                                     owner_address,
+                                     "",
+                                     0,
+                                     LineageReconstructionEligibility::INELIGIBLE_PUT,
+                                     /*add_local_ref=*/true);
+
+  struct Flags {
+    bool callback_fired = false;
+    bool drop_fired = false;
+  } flags;
+  ASSERT_TRUE(core_worker_->AddObjectOutOfScopeOrFreedCallback(
+      object_id,
+      [](const ObjectID &, void *data) {
+        static_cast<Flags *>(data)->callback_fired = true;
+      },
+      &flags,
+      [](void *data) { static_cast<Flags *>(data)->drop_fired = true; }));
+
+  reference_counter_->RemoveLocalReference(object_id, nullptr);
+  FlushObjectFreedCallbacks();
+
+  EXPECT_TRUE(flags.callback_fired);
+  EXPECT_TRUE(flags.drop_fired);
+}
+
+TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_OnDropFiresWhenNotRegistered) {
+  // on_drop must fire synchronously when registration fails (object already out
+  // of scope), so callers can unconditionally rely on on_drop to release user_data.
+  auto object_id = ObjectID::FromRandom();
+  rpc::Address owner_address;
+  owner_address.set_worker_id(core_worker_->GetWorkerID().Binary());
+  reference_counter_->AddOwnedObject(object_id,
+                                     {},
+                                     owner_address,
+                                     "",
+                                     0,
+                                     LineageReconstructionEligibility::INELIGIBLE_PUT,
+                                     /*add_local_ref=*/true);
+  reference_counter_->RemoveLocalReference(object_id, nullptr);
+
+  bool drop_fired = false;
+  ASSERT_FALSE(core_worker_->AddObjectOutOfScopeOrFreedCallback(
+      object_id,
+      [](const ObjectID &, void *) {},
+      &drop_fired,
+      [](void *data) { *static_cast<bool *>(data) = true; }));
+  EXPECT_TRUE(drop_fired);
+}
+
 }  // namespace core
 }  // namespace ray

From 1adeed416c0aa7b28d6a9f6ecf385a7346744a4c Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 15 Jun 2026 12:15:30 -0700
Subject: [PATCH 10/53] Address reviews

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/_raylet.pyx                        | 91 ++++++++++++++-----
 python/ray/includes/libcoreworker.pxd         |  2 +-
 .../test_object_out_of_scope_callback.py      | 78 +---------------
 src/ray/core_worker/core_worker.cc            | 20 ++--
 src/ray/core_worker/core_worker.h             | 41 +++++----
 src/ray/core_worker/core_worker_process.cc    |  2 +-
 src/ray/core_worker/tests/core_worker_test.cc | 47 ++++++++++
 7 files changed, 154 insertions(+), 127 deletions(-)

diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx
index 467361502c7a..eaab79ca3169 100644
--- a/python/ray/_raylet.pyx
+++ b/python/ray/_raylet.pyx
@@ -2761,31 +2761,46 @@ cdef class GcsClient:
                 ray._private.utils._CALLED_FREQ[name] += 1
         return getattr(self.inner, name)
 
-# Invoked by the dedicated object_freed_callback_service_ thread when an object
-# goes out of scope. Acquires the GIL and calls the Python callable stored as
-# user_data. The matching Py_DECREF is handled by _drop_user_data when the C++
-# lambda is destroyed.
+# Module-level dict: object_id_binary (bytes) -> Python callable.
+# Keeps each registered callback alive while its C++ lambda is pending.
+# There is one CoreWorker per process, so module scope is sufficient.
+_object_out_of_scope_callbacks: dict = {}
+
+
 cdef void _invoke_object_out_of_scope_callback(
-        const CObjectID &c_object_id, void *user_data) noexcept nogil:
+        const CObjectID &c_object_id, void *callback_context) noexcept nogil:
+    """Invoked on the object_freed_callback_service_ thread when an object goes
+    out of scope. Looks up and calls the registered Python callback for the
+    object. The callback receives the object ID as ``bytes``.
+
+    Args:
+        c_object_id: The C++ ObjectID of the object that went out of scope.
+        callback_context: A ``bytes`` object (the object ID binary) used as the
+            key into ``_object_out_of_scope_callbacks``. Its lifetime is
+            managed by ``_release_callback_context``.
+    """
     with gil:
         try:
-            object_ref_id = ObjectRef(c_object_id.Binary(), skip_adding_local_ref=True)
-            # skip_adding_local_ref suppresses add_object_ref_reference, but
-            # __init__ still sets in_core_worker=True so __dealloc__ would call
-            # remove_object_ref_reference without a matching add. Clear it so
-            # the temporary ref is truly no-op on the refcount.
-            object_ref_id.in_core_worker = False
-            (<object>user_data)(object_ref_id)
+            id_binary = <bytes>callback_context
+            callback = _object_out_of_scope_callbacks.get(id_binary)
+            if callback is not None:
+                callback(id_binary)
         except BaseException:
             logger.exception("Error in object out-of-scope callback")
 
 
-# Called by C++ when the out-of-scope lambda is destroyed, on both the normal
-# (callback fired) and the shutdown (lambda dropped) paths. Balances the
-# Py_INCREF taken in add_object_out_of_scope_callback.
-cdef void _drop_user_data(void *user_data) noexcept nogil:
+cdef void _release_callback_context(void *callback_context) noexcept nogil:
+    """Called by C++ when the out-of-scope lambda is destroyed.
+
+    Fires on both the normal (callback invoked) and shutdown (lambda dropped)
+    paths. Removes the callback from ``_object_out_of_scope_callbacks`` and
+    releases the Py_INCREF taken on the ``bytes`` key in
+    ``add_object_out_of_scope_callback``.
+    """
     with gil:
-        cpython.Py_DECREF(<object>user_data)
+        id_binary = <bytes>callback_context
+        _object_out_of_scope_callbacks.pop(id_binary, None)
+        cpython.Py_DECREF(<object>callback_context)
 
 
 cdef class CoreWorker:
@@ -4132,23 +4147,49 @@ cdef class CoreWorker:
     def add_object_out_of_scope_callback(self, ObjectRef object_ref, callback):
         """Register a Python callable to fire when object_ref goes out of scope.
 
-        Returns True if registered; False if the object is already out of scope
-        (the callback will never fire and should be discarded).
+        .. warning::
+            This is an internal Ray API. Do not use it outside of Ray libraries.
+
+        Can only be called on the worker that owns object_ref. Raises
+        ValueError if object_ref is not owned by this worker.
+
+        Args:
+            object_ref: The owned object to watch.
+            callback: Called with the object ID as ``bytes`` when the last
+                reference is released. Must be callable.
+
+        Returns:
+            True if registered; False if the object is already out of scope
+            (the callback will never fire).
         """
         if not callable(callback):
             raise TypeError(
                 f"callback must be callable, got {type(callback).__name__!r}"
             )
-        cdef CObjectID c_object_id = object_ref.native()
-        # Keep the callable alive; _drop_user_data balances this Py_INCREF when
-        # the C++ lambda is destroyed (fired or dropped at shutdown).
-        cpython.Py_INCREF(callback)
+        cdef:
+            CObjectID c_object_id = object_ref.native()
+            CAddress c_owner_address
+        op_status = CCoreWorkerProcess.GetCoreWorker().GetOwnerAddress(
+            c_object_id, &c_owner_address)
+        check_status(op_status)
+        this_worker_id = CCoreWorkerProcess.GetCoreWorker().GetWorkerID().Binary()
+        if c_owner_address.worker_id() != this_worker_id:
+            raise ValueError(
+                f"add_object_out_of_scope_callback can only be called for objects "
+                f"owned by this worker. Object {object_ref.hex()} is owned by worker "
+                f"{c_owner_address.worker_id().hex()}."
+            )
+        id_binary = object_ref.binary()
+        _object_out_of_scope_callbacks[id_binary] = callback
+        # id_binary is passed to C++ as a raw void*, so Py_INCREF is required to
+        # prevent GC from collecting it while C++ holds the pointer.
+        cpython.Py_INCREF(id_binary)
         return CCoreWorkerProcess.GetCoreWorker() \
             .AddObjectOutOfScopeOrFreedCallback(
                 c_object_id,
                 _invoke_object_out_of_scope_callback,
-                <void *>callback,
-                _drop_user_data)
+                <void *>id_binary,
+                _release_callback_context)
 
     def get_owner_address(self, ObjectRef object_ref):
         cdef:
diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd
index f39abdc65d3f..71056d89b43e 100644
--- a/python/ray/includes/libcoreworker.pxd
+++ b/python/ray/includes/libcoreworker.pxd
@@ -238,7 +238,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
         c_bool AddObjectOutOfScopeOrFreedCallback(
             const CObjectID &object_id,
             void (*callback)(const CObjectID &, void *) nogil,
-            void *user_data,
+            void *callback_context,
             void (*on_drop)(void *) nogil)
         void PutObjectIntoPlasma(const CRayObject &object,
                                  const CObjectID &object_id)
diff --git a/python/ray/tests/test_object_out_of_scope_callback.py b/python/ray/tests/test_object_out_of_scope_callback.py
index 2e7f34787570..e6061bff6c43 100644
--- a/python/ray/tests/test_object_out_of_scope_callback.py
+++ b/python/ray/tests/test_object_out_of_scope_callback.py
@@ -25,20 +25,21 @@ class TestAddObjectOutOfScopeCallback:
     """Integration tests against a live Ray cluster."""
 
     def test_callback_fires_when_ref_dropped(self, ray_instance):
-        """Callback fires with the correct ObjectRef once the last reference is dropped."""
+        """Callback fires with the correct object ID bytes once the last reference is
+        dropped."""
         received_id = []
         done = threading.Event()
 
         ref = ray.put(42)
         expected_binary = ref.binary()
         registered = _core_worker().add_object_out_of_scope_callback(
-            ref, lambda r: (received_id.append(r), done.set())
+            ref, lambda id_bytes: (received_id.append(id_bytes), done.set())
         )
         assert registered, "Expected registration to succeed"
 
         del ref  # drops the last Python reference — callback must now fire
         assert done.wait(timeout=5), "Callback did not fire within 5 s"
-        assert received_id[0].binary() == expected_binary
+        assert received_id[0] == expected_binary
 
     def test_returns_false_for_already_out_of_scope(self, ray_instance):
         """Returns False when the object is already out of scope; callback never fires."""
@@ -76,49 +77,6 @@ def test_returns_false_for_already_out_of_scope(self, ray_instance):
         assert sentinel_done.wait(timeout=5)
         assert not fired.is_set(), "Callback must not fire for an already-freed object"
 
-    def test_callback_runs_on_non_driver_thread(self, ray_instance):
-        """Callback must run on the dedicated callback thread, not the driver thread."""
-        driver_thread_id = threading.get_ident()
-        callback_thread_ids = []
-        done = threading.Event()
-
-        ref = ray.put("thread_check")
-
-        def cb(_):
-            callback_thread_ids.append(threading.get_ident())
-            done.set()
-
-        _core_worker().add_object_out_of_scope_callback(ref, cb)
-        del ref
-        assert done.wait(timeout=5)
-        assert (
-            callback_thread_ids[0] != driver_thread_id
-        ), "Callback must not run on the driver thread"
-
-    def test_multiple_callbacks_all_fire(self, ray_instance):
-        """Multiple independent callbacks on the same object all fire."""
-        n = 3
-        counter = [0]
-        lock = threading.Lock()
-        done = threading.Event()
-
-        ref = ray.put("multi")
-
-        def make_cb():
-            def cb(_):
-                with lock:
-                    counter[0] += 1
-                    if counter[0] == n:
-                        done.set()
-
-            return cb
-
-        for _ in range(n):
-            _core_worker().add_object_out_of_scope_callback(ref, make_cb())
-
-        del ref
-        assert done.wait(timeout=5), f"Only {counter[0]}/{n} callbacks fired"
-
     def test_ray_internal_free_triggers_callback(self, ray_instance):
         """`ray.internal.free` should trigger the callback."""
         fired = threading.Event()
@@ -151,34 +109,6 @@ def bad_cb(_):
             timeout=5
         ), "Exception in one callback must not prevent subsequent callbacks"
 
-    def test_callback_fires_exactly_once(self, ray_instance):
-        """The callback fires exactly once."""
-        count = [0]
-        lock = threading.Lock()
-        done = threading.Event()
-
-        ref = ray.put("once")
-
-        def cb(_):
-            with lock:
-                count[0] += 1
-            done.set()
-
-        _core_worker().add_object_out_of_scope_callback(ref, cb)
-        del ref
-        assert done.wait(timeout=5)
-
-        # Drain the callback thread via a sentinel: any spurious second fire would
-        # arrive before the sentinel, so if count is still 1 after this, we're clean.
-        sentinel_done = threading.Event()
-        sentinel = ray.put("sentinel")
-        _core_worker().add_object_out_of_scope_callback(
-            sentinel, lambda _: sentinel_done.set()
-        )
-        del sentinel
-        assert sentinel_done.wait(timeout=5)
-        assert count[0] == 1, f"Expected exactly 1 callback, got {count[0]}"
-
 
 if __name__ == "__main__":
     import sys
diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc
index 019c69ad444a..52fb34432f8a 100644
--- a/src/ray/core_worker/core_worker.cc
+++ b/src/ray/core_worker/core_worker.cc
@@ -2481,12 +2481,14 @@ bool CoreWorker::IsTaskCanceled(const TaskID &task_id) const {
 
 bool CoreWorker::AddObjectOutOfScopeOrFreedCallback(
     const ObjectID &object_id, const std::function<void(const ObjectID &)> &callback) {
-  // Wrap so the actual callback runs on the dedicated thread.
-  // The wrapper itself is quick (just a post) and safe to call under the
-  // ReferenceCounter mutex. Capture the service by reference (not `this`) so
-  // the lambda remains safe if CoreWorker is destroyed before the RC calls it.
-  auto wrapped = [&svc = object_freed_callback_service_, callback](const ObjectID &id) {
-    svc.post([callback, id]() { callback(id); }, "CoreWorker.ObjFreedCb");
+  RAY_CHECK(HasOwner(object_id))
+      << "AddObjectOutOfScopeOrFreedCallback can only be called for objects owned by "
+         "this worker. Object: "
+      << object_id;
+  auto wrapped = [&object_freed_callback_service = object_freed_callback_service_,
+                  callback](const ObjectID &id) {
+    object_freed_callback_service.post([callback, id]() { callback(id); },
+                                       "CoreWorker.ObjFreedCb");
   };
   return reference_counter_->AddObjectOutOfScopeOrFreedCallback(object_id, wrapped);
 }
@@ -2494,12 +2496,12 @@ bool CoreWorker::AddObjectOutOfScopeOrFreedCallback(
 bool CoreWorker::AddObjectOutOfScopeOrFreedCallback(const ObjectID &object_id,
                                                     void (*callback)(const ObjectID &,
                                                                      void *),
-                                                    void *user_data,
+                                                    void *callback_context,
                                                     void (*on_drop)(void *)) {
   RAY_CHECK(callback != nullptr) << "callback must not be null";
-  // Wrap user_data in a shared_ptr so on_drop is called when the lambda is
+  // Wrap callback_context in a shared_ptr so on_drop is called when the lambda is
   // destroyed — whether the callback fired normally or was dropped at shutdown.
-  auto owned = std::shared_ptr<void>(user_data, [on_drop](void *p) {
+  auto owned = std::shared_ptr<void>(callback_context, [on_drop](void *p) {
     if (on_drop) on_drop(p);
   });
   return AddObjectOutOfScopeOrFreedCallback(
diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h
index 6720f9fdf09c..725870b57c1c 100644
--- a/src/ray/core_worker/core_worker.h
+++ b/src/ray/core_worker/core_worker.h
@@ -402,30 +402,37 @@ class CoreWorker : public std::enable_shared_from_this<CoreWorker> {
   }
 
   /// Register a callback to fire when an object goes out of scope or is freed.
+  /// Can only be called for objects owned by this worker. The callback is posted
+  /// to the dedicated object_freed_callback_service_ thread so it never blocks
+  /// the main IO thread.
   ///
-  /// The callback is posted to the dedicated object_freed_callback_service_ thread
-  /// so it never blocks the main IO thread.
-  ///
-  /// \return true if the callback was registered; false if the object is already
-  ///         out of scope or was explicitly freed (callback will never fire).
+  /// \param[in] object_id The owned object to watch.
+  /// \param[in] callback Invoked with the object_id when the object goes out of scope.
+  /// \return true if registered; false if the object is already out of scope or freed
+  ///         (callback will never fire).
   bool AddObjectOutOfScopeOrFreedCallback(
       const ObjectID &object_id, const std::function<void(const ObjectID &)> &callback);
 
-  /// C function-pointer overload for use from Cython.
-  ///
-  /// \param object_id The object to watch.
-  /// \param callback Invoked with (object_id, user_data) when the object goes
-  ///        out of scope.
-  /// \param user_data Passed as the second argument to callback.
-  /// \param on_drop If non-null, called with user_data when the internal lambda
-  ///        is destroyed. Necessary because destroying a raw void* is a no-op,
-  ///        so on_drop is the only way to release resources (e.g. Py_DECREF)
-  ///        on both the normal and shutdown (RC teardown) paths.
-  /// \return true if registered; false if the object is already out of scope
+  /// C function-pointer overload of AddObjectOutOfScopeOrFreedCallback for use
+  /// from Cython. Can only be called for objects owned by this worker.
+  ///
+  /// \param[in] object_id The owned object to watch.
+  /// \param[in] callback Function to invoke when the object goes out of scope. Called
+  ///            with (object_id, callback_context). Must remain valid until `on_drop`
+  ///            fires.
+  /// \param[in] callback_context Opaque pointer forwarded unchanged to `callback` and
+  ///            `on_drop`. In the Cython overload, this is a pointer to a Python `bytes`
+  ///            object containing the object ID binary, used as the key into the callback
+  ///            registry.
+  /// \param[in] on_drop Destructor for `callback_context`. If non-null, called when the
+  ///            internal lambda is destroyed, on both the normal (callback invoked) and
+  ///            shutdown (lambda dropped without firing) paths. Pass a function that
+  ///            releases any resources held by `callback_context`.
+  /// \return true if registered; false if the object is already out of scope or freed
   ///         (callback will never fire).
   bool AddObjectOutOfScopeOrFreedCallback(const ObjectID &object_id,
                                           void (*callback)(const ObjectID &, void *),
-                                          void *user_data,
+                                          void *callback_context,
                                           void (*on_drop)(void *) = nullptr);
 
   int GetMemoryStoreSize() { return memory_store_->Size(); }
diff --git a/src/ray/core_worker/core_worker_process.cc b/src/ray/core_worker/core_worker_process.cc
index 08613ce3ecbb..ce885ebbe5bd 100644
--- a/src/ray/core_worker/core_worker_process.cc
+++ b/src/ray/core_worker/core_worker_process.cc
@@ -204,7 +204,7 @@ std::shared_ptr<CoreWorker> CoreWorkerProcessImpl::CreateCoreWorker(
     sigaddset(&mask, SIGTERM);
     pthread_sigmask(SIG_BLOCK, &mask, nullptr);
 #endif
-    SetThreadName("worker.obj_freed_cb");
+    SetThreadName("worker.user_obj_freed_callback");
     object_freed_callback_service_.run();
     RAY_LOG(INFO) << "Object-freed callback service stopped.";
   });
diff --git a/src/ray/core_worker/tests/core_worker_test.cc b/src/ray/core_worker/tests/core_worker_test.cc
index 2f2f580b38c9..b8c6b3346235 100644
--- a/src/ray/core_worker/tests/core_worker_test.cc
+++ b/src/ray/core_worker/tests/core_worker_test.cc
@@ -1512,5 +1512,52 @@ TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_OnDropFiresWhenNotRegistered)
   EXPECT_TRUE(drop_fired);
 }
 
+TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_MultipleCallbacksAllFire) {
+  auto object_id = ObjectID::FromRandom();
+  rpc::Address owner_address;
+  owner_address.set_worker_id(core_worker_->GetWorkerID().Binary());
+  reference_counter_->AddOwnedObject(object_id,
+                                     {},
+                                     owner_address,
+                                     "",
+                                     0,
+                                     LineageReconstructionEligibility::INELIGIBLE_PUT,
+                                     /*add_local_ref=*/true);
+
+  int fire_count = 0;
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_TRUE(core_worker_->AddObjectOutOfScopeOrFreedCallback(
+        object_id, [&fire_count](const ObjectID &) { ++fire_count; }));
+  }
+
+  reference_counter_->RemoveLocalReference(object_id, nullptr);
+  FlushObjectFreedCallbacks();
+
+  EXPECT_EQ(fire_count, 3);
+}
+
+TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_FiresExactlyOnce) {
+  auto object_id = ObjectID::FromRandom();
+  rpc::Address owner_address;
+  owner_address.set_worker_id(core_worker_->GetWorkerID().Binary());
+  reference_counter_->AddOwnedObject(object_id,
+                                     {},
+                                     owner_address,
+                                     "",
+                                     0,
+                                     LineageReconstructionEligibility::INELIGIBLE_PUT,
+                                     /*add_local_ref=*/true);
+
+  int fire_count = 0;
+  ASSERT_TRUE(core_worker_->AddObjectOutOfScopeOrFreedCallback(
+      object_id, [&fire_count](const ObjectID &) { ++fire_count; }));
+
+  reference_counter_->RemoveLocalReference(object_id, nullptr);
+  FlushObjectFreedCallbacks();
+  FlushObjectFreedCallbacks();  // second flush must not re-fire
+
+  EXPECT_EQ(fire_count, 1);
+}
+
 }  // namespace core
 }  // namespace ray

From 738dcaa35e6eb1cf9dc944f0e8c5110aad9ecf40 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 15 Jun 2026 16:00:21 -0700
Subject: [PATCH 11/53] Address comments again

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/_raylet.pyx                        | 54 ++++++------------
 python/ray/includes/libcoreworker.pxd         |  3 +-
 src/ray/core_worker/core_worker.cc            | 16 ++----
 src/ray/core_worker/core_worker.h             | 16 ++----
 src/ray/core_worker/tests/core_worker_test.cc | 57 -------------------
 5 files changed, 26 insertions(+), 120 deletions(-)

diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx
index ed2a7646a432..c00bf6af6db4 100644
--- a/python/ray/_raylet.pyx
+++ b/python/ray/_raylet.pyx
@@ -2848,46 +2848,26 @@ cdef class GcsClient:
                 ray._private.utils._CALLED_FREQ[name] += 1
         return getattr(self.inner, name)
 
-# Module-level dict: object_id_binary (bytes) -> Python callable.
-# Keeps each registered callback alive while its C++ lambda is pending.
-# There is one CoreWorker per process, so module scope is sufficient.
-_object_out_of_scope_callbacks: dict = {}
-
-
 cdef void _invoke_object_out_of_scope_callback(
-        const CObjectID &c_object_id, void *callback_context) noexcept nogil:
+        const CObjectID &c_object_id, void *user_callback) noexcept nogil:
     """Invoked on the object_freed_callback_service_ thread when an object goes
-    out of scope. Looks up and calls the registered Python callback for the
-    object. The callback receives the object ID as ``bytes``.
+    out of scope. Calls the registered Python callback with the object ID as
+    ``bytes``, then releases the Py_INCREF taken at registration.
 
     Args:
         c_object_id: The C++ ObjectID of the object that went out of scope.
-        callback_context: A ``bytes`` object (the object ID binary) used as the
-            key into ``_object_out_of_scope_callbacks``. Its lifetime is
-            managed by ``_release_callback_context``.
+        user_callback: The Python callable registered by the caller, kept
+            alive by the Py_INCREF in ``add_object_out_of_scope_callback``.
     """
     with gil:
         try:
-            id_binary = <bytes>callback_context
-            callback = _object_out_of_scope_callbacks.get(id_binary)
-            if callback is not None:
-                callback(id_binary)
+            callback = <object>user_callback
+            id_binary = c_object_id.Binary()
+            callback(id_binary)
         except BaseException:
             logger.exception("Error in object out-of-scope callback")
-
-
-cdef void _release_callback_context(void *callback_context) noexcept nogil:
-    """Called by C++ when the out-of-scope lambda is destroyed.
-
-    Fires on both the normal (callback invoked) and shutdown (lambda dropped)
-    paths. Removes the callback from ``_object_out_of_scope_callbacks`` and
-    releases the Py_INCREF taken on the ``bytes`` key in
-    ``add_object_out_of_scope_callback``.
-    """
-    with gil:
-        id_binary = <bytes>callback_context
-        _object_out_of_scope_callbacks.pop(id_binary, None)
-        cpython.Py_DECREF(<object>callback_context)
+        finally:
+            cpython.Py_DECREF(<object>user_callback)
 
 
 cdef class CoreWorker:
@@ -4272,17 +4252,15 @@ cdef class CoreWorker:
                 f"owned by this worker. Object {object_ref.hex()} is owned by worker "
                 f"{c_owner_address.worker_id().hex()}."
             )
-        id_binary = object_ref.binary()
-        _object_out_of_scope_callbacks[id_binary] = callback
-        # id_binary is passed to C++ as a raw void*, so Py_INCREF is required to
-        # prevent GC from collecting it while C++ holds the pointer.
-        cpython.Py_INCREF(id_binary)
-        return CCoreWorkerProcess.GetCoreWorker() \
+        cpython.Py_INCREF(callback)
+        registered = CCoreWorkerProcess.GetCoreWorker() \
             .AddObjectOutOfScopeOrFreedCallback(
                 c_object_id,
                 _invoke_object_out_of_scope_callback,
-                <void *>id_binary,
-                _release_callback_context)
+                <void *>callback)
+        if not registered:
+            cpython.Py_DECREF(callback)
+        return registered
 
     def get_owner_address(self, ObjectRef object_ref):
         cdef:
diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd
index 44aabc0d5c25..770133ec126a 100644
--- a/python/ray/includes/libcoreworker.pxd
+++ b/python/ray/includes/libcoreworker.pxd
@@ -238,8 +238,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
         c_bool AddObjectOutOfScopeOrFreedCallback(
             const CObjectID &object_id,
             void (*callback)(const CObjectID &, void *) nogil,
-            void *callback_context,
-            void (*on_drop)(void *) nogil)
+            void *callback_context)
         void PutObjectIntoPlasma(const CRayObject &object,
                                  const CObjectID &object_id)
         const CAddress &GetRpcAddress() const
diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc
index 89f96dba9c51..abfbdf249748 100644
--- a/src/ray/core_worker/core_worker.cc
+++ b/src/ray/core_worker/core_worker.cc
@@ -2485,10 +2485,6 @@ bool CoreWorker::IsTaskCanceled(const TaskID &task_id) const {
 
 bool CoreWorker::AddObjectOutOfScopeOrFreedCallback(
     const ObjectID &object_id, const std::function<void(const ObjectID &)> &callback) {
-  RAY_CHECK(HasOwner(object_id))
-      << "AddObjectOutOfScopeOrFreedCallback can only be called for objects owned by "
-         "this worker. Object: "
-      << object_id;
   auto wrapped = [&object_freed_callback_service = object_freed_callback_service_,
                   callback](const ObjectID &id) {
     object_freed_callback_service.post([callback, id]() { callback(id); },
@@ -2500,16 +2496,12 @@ bool CoreWorker::AddObjectOutOfScopeOrFreedCallback(
 bool CoreWorker::AddObjectOutOfScopeOrFreedCallback(const ObjectID &object_id,
                                                     void (*callback)(const ObjectID &,
                                                                      void *),
-                                                    void *callback_context,
-                                                    void (*on_drop)(void *)) {
+                                                    void *callback_context) {
   RAY_CHECK(callback != nullptr) << "callback must not be null";
-  // Wrap callback_context in a shared_ptr so on_drop is called when the lambda is
-  // destroyed — whether the callback fired normally or was dropped at shutdown.
-  auto owned = std::shared_ptr<void>(callback_context, [on_drop](void *p) {
-    if (on_drop) on_drop(p);
-  });
   return AddObjectOutOfScopeOrFreedCallback(
-      object_id, [callback, owned](const ObjectID &id) { callback(id, owned.get()); });
+      object_id, [callback, callback_context](const ObjectID &id) {
+        callback(id, callback_context);
+      });
 }
 
 Status CoreWorker::CancelChildren(const TaskID &task_id, bool force_kill) {
diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h
index e396e7d27e50..7e864af626de 100644
--- a/src/ray/core_worker/core_worker.h
+++ b/src/ray/core_worker/core_worker.h
@@ -420,22 +420,16 @@ class CoreWorker : public std::enable_shared_from_this<CoreWorker> {
   ///
   /// \param[in] object_id The owned object to watch.
   /// \param[in] callback Function to invoke when the object goes out of scope. Called
-  ///            with (object_id, callback_context). Must remain valid until `on_drop`
-  ///            fires.
-  /// \param[in] callback_context Opaque pointer forwarded unchanged to `callback` and
-  ///            `on_drop`. In the Cython overload, this is a pointer to a Python `bytes`
-  ///            object containing the object ID binary, used as the key into the callback
+  ///            with (object_id, callback_context).
+  /// \param[in] callback_context Opaque pointer forwarded unchanged to `callback`.
+  ///            In the Cython overload, this is a pointer to the registered Python
+  ///            callable (kept alive via Py_INCREF), not a key into a callback
   ///            registry.
-  /// \param[in] on_drop Destructor for `callback_context`. If non-null, called when the
-  ///            internal lambda is destroyed, on both the normal (callback invoked) and
-  ///            shutdown (lambda dropped without firing) paths. Pass a function that
-  ///            releases any resources held by `callback_context`.
   /// \return true if registered; false if the object is already out of scope or freed
   ///         (callback will never fire).
   bool AddObjectOutOfScopeOrFreedCallback(const ObjectID &object_id,
                                           void (*callback)(const ObjectID &, void *),
-                                          void *callback_context,
-                                          void (*on_drop)(void *) = nullptr);
+                                          void *callback_context);
 
   int GetMemoryStoreSize() { return memory_store_->Size(); }
 
diff --git a/src/ray/core_worker/tests/core_worker_test.cc b/src/ray/core_worker/tests/core_worker_test.cc
index dc54b51fee47..66529a31c39d 100644
--- a/src/ray/core_worker/tests/core_worker_test.cc
+++ b/src/ray/core_worker/tests/core_worker_test.cc
@@ -1463,63 +1463,6 @@ TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_CFunctionPointerOverload) {
   EXPECT_EQ(result.id, object_id);
 }
 
-TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_OnDropFiresWithCallback) {
-  // on_drop must fire on the normal path (after the callback is invoked and
-  // the internal lambda is destroyed).
-  auto object_id = ObjectID::FromRandom();
-  rpc::Address owner_address;
-  owner_address.set_worker_id(core_worker_->GetWorkerID().Binary());
-  reference_counter_->AddOwnedObject(object_id,
-                                     {},
-                                     owner_address,
-                                     "",
-                                     0,
-                                     LineageReconstructionEligibility::INELIGIBLE_PUT,
-                                     /*add_local_ref=*/true);
-
-  struct Flags {
-    bool callback_fired = false;
-    bool drop_fired = false;
-  } flags;
-  ASSERT_TRUE(core_worker_->AddObjectOutOfScopeOrFreedCallback(
-      object_id,
-      [](const ObjectID &, void *data) {
-        static_cast<Flags *>(data)->callback_fired = true;
-      },
-      &flags,
-      [](void *data) { static_cast<Flags *>(data)->drop_fired = true; }));
-
-  reference_counter_->RemoveLocalReference(object_id, nullptr);
-  FlushObjectFreedCallbacks();
-
-  EXPECT_TRUE(flags.callback_fired);
-  EXPECT_TRUE(flags.drop_fired);
-}
-
-TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_OnDropFiresWhenNotRegistered) {
-  // on_drop must fire synchronously when registration fails (object already out
-  // of scope), so callers can unconditionally rely on on_drop to release user_data.
-  auto object_id = ObjectID::FromRandom();
-  rpc::Address owner_address;
-  owner_address.set_worker_id(core_worker_->GetWorkerID().Binary());
-  reference_counter_->AddOwnedObject(object_id,
-                                     {},
-                                     owner_address,
-                                     "",
-                                     0,
-                                     LineageReconstructionEligibility::INELIGIBLE_PUT,
-                                     /*add_local_ref=*/true);
-  reference_counter_->RemoveLocalReference(object_id, nullptr);
-
-  bool drop_fired = false;
-  ASSERT_FALSE(core_worker_->AddObjectOutOfScopeOrFreedCallback(
-      object_id,
-      [](const ObjectID &, void *) {},
-      &drop_fired,
-      [](void *data) { *static_cast<bool *>(data) = true; }));
-  EXPECT_TRUE(drop_fired);
-}
-
 TEST_F(CoreWorkerTest, AddObjectOutOfScopeCallback_MultipleCallbacksAllFire) {
   auto object_id = ObjectID::FromRandom();
   rpc::Address owner_address;

From 2f43a3e7c7eff7029a8b25bb4d7f930e13807867 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 17 Jun 2026 15:50:31 -0700
Subject: [PATCH 12/53] Address reviews

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/_raylet.pyx | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx
index c00bf6af6db4..837124e874a0 100644
--- a/python/ray/_raylet.pyx
+++ b/python/ray/_raylet.pyx
@@ -4217,7 +4217,8 @@ cdef class CoreWorker:
             CCoreWorkerProcess.GetCoreWorker().RemoveLocalReference(
                 c_object_id)
 
-    def add_object_out_of_scope_callback(self, ObjectRef object_ref, callback):
+    def add_object_out_of_scope_callback(
+            self, ObjectRef object_ref, callback: Callable[[bytes], None]):
         """Register a Python callable to fire when object_ref goes out of scope.
 
         .. warning::
@@ -4226,10 +4227,17 @@ cdef class CoreWorker:
         Can only be called on the worker that owns object_ref. Raises
         ValueError if object_ref is not owned by this worker.
 
+        The callback runs on a dedicated background thread concurrent with the
+        main Python thread. It must be thread-safe; use a lock if it ever accesses
+        state shared with the main thread.
+
+        If the callback raises, the exception is logged and swallowed so that
+        subsequent callbacks are not affected.
+
         Args:
             object_ref: The owned object to watch.
             callback: Called with the object ID as ``bytes`` when the last
-                reference is released. Must be callable.
+                reference is released.
 
         Returns:
             True if registered; False if the object is already out of scope

From 3fa9b001a060f4678af07ad8ce7e3dbfe7e690e2 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Thu, 18 Jun 2026 16:05:09 -0700
Subject: [PATCH 13/53] Address comments and ameliorate codebase

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/_raylet.pyx                | 34 ++++++++++++++++-----------
 python/ray/includes/libcoreworker.pxd |  1 +
 src/ray/core_worker/core_worker.cc    | 11 +++++++++
 src/ray/core_worker/core_worker.h     | 11 +++++++++
 4 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx
index 837124e874a0..a05852e07747 100644
--- a/python/ray/_raylet.pyx
+++ b/python/ray/_raylet.pyx
@@ -2865,7 +2865,16 @@ cdef void _invoke_object_out_of_scope_callback(
             id_binary = c_object_id.Binary()
             callback(id_binary)
         except BaseException:
-            logger.exception("Error in object out-of-scope callback")
+            # Invoked from C++ through a C function pointer, so a propagating
+            # exception would be undefined behavior; that is why we catch
+            # everything here, including KeyboardInterrupt/SystemExit.
+            logger.exception(
+                "Exception in the callback registered via "
+                "CoreWorker.add_object_out_of_scope_callback for object %s. The "
+                "callback must be non-blocking and exception-free, so check it "
+                "for I/O, blocking calls, or bugs that raise.",
+                c_object_id.Hex().decode("ascii"),
+            )
         finally:
             cpython.Py_DECREF(<object>user_callback)
 
@@ -4231,6 +4240,13 @@ cdef class CoreWorker:
         main Python thread. It must be thread-safe; use a lock if it ever accesses
         state shared with the main thread.
 
+        .. warning::
+            The callback runs on a single thread shared by every out-of-scope
+            notification for this worker, so it MUST be O(1) and non-blocking.
+            Anything that blocks here serializes every subsequent callback on
+            this worker. Keep operations that can hang or fail out of the
+            callback.
+
         If the callback raises, the exception is logged and swallowed so that
         subsequent callbacks are not affected.
 
@@ -4247,19 +4263,9 @@ cdef class CoreWorker:
             raise TypeError(
                 f"callback must be callable, got {type(callback).__name__!r}"
             )
-        cdef:
-            CObjectID c_object_id = object_ref.native()
-            CAddress c_owner_address
-        op_status = CCoreWorkerProcess.GetCoreWorker().GetOwnerAddress(
-            c_object_id, &c_owner_address)
-        check_status(op_status)
-        this_worker_id = CCoreWorkerProcess.GetCoreWorker().GetWorkerID().Binary()
-        if c_owner_address.worker_id() != this_worker_id:
-            raise ValueError(
-                f"add_object_out_of_scope_callback can only be called for objects "
-                f"owned by this worker. Object {object_ref.hex()} is owned by worker "
-                f"{c_owner_address.worker_id().hex()}."
-            )
+        cdef CObjectID c_object_id = object_ref.native()
+        check_status(CCoreWorkerProcess.GetCoreWorker().CheckObjectOwnedByUs(
+            c_object_id))
         cpython.Py_INCREF(callback)
         registered = CCoreWorkerProcess.GetCoreWorker() \
             .AddObjectOutOfScopeOrFreedCallback(
diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd
index 770133ec126a..dbedcf59b3ac 100644
--- a/python/ray/includes/libcoreworker.pxd
+++ b/python/ray/includes/libcoreworker.pxd
@@ -239,6 +239,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
             const CObjectID &object_id,
             void (*callback)(const CObjectID &, void *) nogil,
             void *callback_context)
+        CRayStatus CheckObjectOwnedByUs(const CObjectID &object_id) const
         void PutObjectIntoPlasma(const CRayObject &object,
                                  const CObjectID &object_id)
         const CAddress &GetRpcAddress() const
diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc
index e3d155012b18..392280665ad1 100644
--- a/src/ray/core_worker/core_worker.cc
+++ b/src/ray/core_worker/core_worker.cc
@@ -2504,6 +2504,17 @@ bool CoreWorker::AddObjectOutOfScopeOrFreedCallback(const ObjectID &object_id,
       });
 }
 
+Status CoreWorker::CheckObjectOwnedByUs(const ObjectID &object_id) const {
+  if (reference_counter_->OwnedByUs(object_id)) {
+    return Status::OK();
+  }
+  return Status::InvalidArgument(absl::StrFormat(
+      "Cannot register an out-of-scope/freed callback for object %s: it is not "
+      "owned by this worker (it may be owned by another worker, or have no "
+      "ownership record). These callbacks can only be registered by the owner.",
+      object_id.Hex()));
+}
+
 Status CoreWorker::CancelChildren(const TaskID &task_id, bool force_kill) {
   absl::flat_hash_set<TaskID> unknown_child_task_ids;
   auto child_task_ids = task_manager_->GetPendingChildrenTasks(task_id);
diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h
index 7e864af626de..bf4006eacabe 100644
--- a/src/ray/core_worker/core_worker.h
+++ b/src/ray/core_worker/core_worker.h
@@ -431,6 +431,17 @@ class CoreWorker : public std::enable_shared_from_this<CoreWorker> {
                                           void (*callback)(const ObjectID &, void *),
                                           void *callback_context);
 
+  /// Validate that the given object is owned by this worker. Used to gate
+  /// owner-only operations (e.g. registering an out-of-scope/freed callback)
+  /// so the error is constructed in C++ and propagated through the standard
+  /// Status path rather than re-implemented at each binding.
+  ///
+  /// \param[in] object_id The object to check.
+  /// \return Status::OK if this worker is the owner of the object;
+  ///         Status::InvalidArgument otherwise (the rejected case in practice is
+  ///         a borrowed object owned by another worker).
+  Status CheckObjectOwnedByUs(const ObjectID &object_id) const;
+
   int GetMemoryStoreSize() { return memory_store_->Size(); }
 
   /// Returns a map of all ObjectIDs currently in scope with a pair of their

From 78d4a231a9ac559130e0c1d7147879d8e9c88af6 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Thu, 18 Jun 2026 16:31:12 -0700
Subject: [PATCH 14/53] Add callback latency measuring tests

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/_private/ray_perf.py               | 53 +++++++++++++++++++
 .../test_object_out_of_scope_callback.py      | 33 ++++++++++--
 2 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py
index f17fd2ac01a7..a60f4877f94c 100644
--- a/python/ray/_private/ray_perf.py
+++ b/python/ray/_private/ray_perf.py
@@ -3,10 +3,12 @@
 import asyncio
 import logging
 import multiprocessing
+import threading
 
 import numpy as np
 
 import ray
+import ray._private.worker
 from ray._private.ray_client_microbenchmark import main as client_microbenchmark_main
 from ray._private.ray_microbenchmark_helpers import timeit
 
@@ -156,6 +158,57 @@ def get_containing_object_ref():
         "single client get object containing 10k refs", get_containing_object_ref
     )
 
+    # Object out-of-scope callbacks (used by Ray Data's BlockRefCounter for
+    # per-operator memory accounting). These exercise the
+    # ``CoreWorker.add_object_out_of_scope_callback`` path, which registers a
+    # Python callable that fires on a dedicated background thread once the
+    # object's last reference is dropped. We measure two things:
+    #   1. registration overhead, and
+    #   2. end-to-end throughput at scale from dropping the refs to every
+    #      callback firing on the background thread.
+    core_worker = ray._private.worker.global_worker.core_worker
+
+    def register_oos_callback():
+        ref = ray.put(0)
+        core_worker.add_object_out_of_scope_callback(ref, lambda _: None)
+        del ref
+
+    results += timeit(
+        "single client object out-of-scope callback registration",
+        register_oos_callback,
+    )
+
+    def oos_callback_fire_batch():
+        # Batch size is sized for a stable timeit signal (enough work per call to
+        # amortize fixed overhead, small enough that timeit runs it many times),
+        # not to model peak production concurrency.
+        n = 1000
+        remaining = [n]
+        lock = threading.Lock()
+        done = threading.Event()
+
+        def on_freed(_):
+            with lock:
+                remaining[0] -= 1
+                if remaining[0] == 0:
+                    done.set()
+
+        refs = [ray.put(0) for _ in range(n)]
+        for ref in refs:
+            core_worker.add_object_out_of_scope_callback(ref, on_freed)
+        # Drop the last references; every callback must then fire on the
+        # dedicated background thread.
+        del refs
+        if not done.wait(timeout=30):
+            raise TimeoutError(
+                f"Only {n - remaining[0]}/{n} out-of-scope callbacks fired "
+                "within 30s"
+            )
+
+    results += timeit(
+        "object out-of-scope callback fire 1k batch", oos_callback_fire_batch, 1000
+    )
+
     def wait_multiple_refs():
         num_objs = 1000
         not_ready = [small_value.remote() for _ in range(num_objs)]
diff --git a/python/ray/tests/test_object_out_of_scope_callback.py b/python/ray/tests/test_object_out_of_scope_callback.py
index e6061bff6c43..b77451cf4b3b 100644
--- a/python/ray/tests/test_object_out_of_scope_callback.py
+++ b/python/ray/tests/test_object_out_of_scope_callback.py
@@ -1,6 +1,3 @@
-# These tests require a live Ray cluster because the callback path goes through
-# the C++ ReferenceCounter and the dedicated object_freed_callback_service_ thread.
-
 import threading
 import time
 
@@ -90,6 +87,36 @@ def test_ray_internal_free_triggers_callback(self, ray_instance):
         ray.internal.free([ref])
         assert fired.wait(timeout=5), "Callback did not fire after ray.internal.free"
 
+    def test_many_callbacks_all_fire(self, ray_instance):
+        """At scale, every registered callback fires exactly once after its ref is
+        dropped. Guards the dedicated callback thread against dropped or
+        duplicated notifications under load."""
+        n = 2000
+        remaining = [n]
+        lock = threading.Lock()
+        all_fired = threading.Event()
+        fire_counts = {}
+
+        def on_freed(id_bytes):
+            with lock:
+                fire_counts[id_bytes] = fire_counts.get(id_bytes, 0) + 1
+                remaining[0] -= 1
+                if remaining[0] == 0:
+                    all_fired.set()
+
+        refs = [ray.put(i) for i in range(n)]
+        expected = {r.binary() for r in refs}
+        for ref in refs:
+            assert _core_worker().add_object_out_of_scope_callback(ref, on_freed)
+
+        del refs  # drop every last reference at once
+        assert all_fired.wait(
+            timeout=30
+        ), f"Only {n - remaining[0]}/{n} callbacks fired within 30s"
+        with lock:
+            assert set(fire_counts) == expected, "Unexpected or missing object IDs"
+            assert all(c == 1 for c in fire_counts.values()), "A callback fired twice"
+
     def test_callback_exception_does_not_crash(self, ray_instance):
         """A Python exception inside the callback must not propagate to C++."""
         second_fired = threading.Event()

From a37a262e6491b2b838a63acf4e3c6d9ca736e0a3 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 22 Jun 2026 13:19:41 -0700
Subject: [PATCH 15/53] Scope ref registration assertion in test

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/tests/test_object_out_of_scope_callback.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/ray/tests/test_object_out_of_scope_callback.py b/python/ray/tests/test_object_out_of_scope_callback.py
index b77451cf4b3b..081087b76c88 100644
--- a/python/ray/tests/test_object_out_of_scope_callback.py
+++ b/python/ray/tests/test_object_out_of_scope_callback.py
@@ -106,9 +106,10 @@ def on_freed(id_bytes):
 
         refs = [ray.put(i) for i in range(n)]
         expected = {r.binary() for r in refs}
-        for ref in refs:
-            assert _core_worker().add_object_out_of_scope_callback(ref, on_freed)
-
+        assert all(
+            _core_worker().add_object_out_of_scope_callback(ref, on_freed)
+            for ref in refs
+        )
         del refs  # drop every last reference at once
         assert all_fired.wait(
             timeout=30

From 0280347cd8bd169fa6589f60a6a6a2222981a722 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Tue, 23 Jun 2026 14:24:27 -0700
Subject: [PATCH 16/53] Fix ray_perf.py unfired callback

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/_private/ray_perf.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py
index a60f4877f94c..f6602ea6acc1 100644
--- a/python/ray/_private/ray_perf.py
+++ b/python/ray/_private/ray_perf.py
@@ -194,8 +194,9 @@ def on_freed(_):
                     done.set()
 
         refs = [ray.put(0) for _ in range(n)]
-        for ref in refs:
-            core_worker.add_object_out_of_scope_callback(ref, on_freed)
+        assert all(
+            core_worker.add_object_out_of_scope_callback(ref, on_freed) for ref in refs
+        )
         # Drop the last references; every callback must then fire on the
         # dedicated background thread.
         del refs

From 7903d341292e34b6372a1bc4a52e3e7a54aab83e Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 24 Jun 2026 17:54:18 -0700
Subject: [PATCH 17/53] Scaled-up callback throughput benchmark

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../object_store/callback_throughput.yaml     | 14 +++
 .../object_store/test_callback_throughput.py  | 99 +++++++++++++++++++
 release/release_tests.yaml                    | 24 +++++
 3 files changed, 137 insertions(+)
 create mode 100644 release/benchmarks/object_store/callback_throughput.yaml
 create mode 100644 release/benchmarks/object_store/test_callback_throughput.py

diff --git a/release/benchmarks/object_store/callback_throughput.yaml b/release/benchmarks/object_store/callback_throughput.yaml
new file mode 100644
index 000000000000..5c49cc5f4242
--- /dev/null
+++ b/release/benchmarks/object_store/callback_throughput.yaml
@@ -0,0 +1,14 @@
+cloud: {{env["ANYSCALE_CLOUD_NAME"]}}
+
+head_node:
+    instance_type: m6i.2xlarge
+    resources:
+      node: 1
+
+worker_nodes:
+    - instance_type: m6i.2xlarge
+      min_nodes: 100
+      max_nodes: 100
+      market_type: ON_DEMAND
+      resources:
+        node: 1
diff --git a/release/benchmarks/object_store/test_callback_throughput.py b/release/benchmarks/object_store/test_callback_throughput.py
new file mode 100644
index 000000000000..0e2ff6aa9054
--- /dev/null
+++ b/release/benchmarks/object_store/test_callback_throughput.py
@@ -0,0 +1,99 @@
+import gc
+import json
+import os
+import threading
+import time
+
+import numpy as np
+
+import ray
+import ray._private.worker
+
+NUM_WORKERS = 100
+OBJECT_SIZE = 1024 * 1024  # 1 MiB, above the 100 KB inlining threshold
+
+
+@ray.remote(num_cpus=1)
+def create_object_on_worker():
+    return np.zeros(OBJECT_SIZE, dtype=np.uint8)
+
+
+def test_callback_throughput(num_refs, timeout_s=60):
+    core_worker = ray._private.worker.global_worker.core_worker
+
+    remaining = [num_refs]
+    lock = threading.Lock()
+    done = threading.Event()
+
+    def on_freed(_id_bytes):
+        with lock:
+            remaining[0] -= 1
+            if remaining[0] == 0:
+                done.set()
+
+    refs = [
+        create_object_on_worker.options(scheduling_strategy="SPREAD").remote()
+        for _ in range(num_refs)
+    ]
+    ray.wait(refs, num_returns=len(refs))
+
+    assert all(
+        core_worker.add_object_out_of_scope_callback(ref, on_freed) for ref in refs
+    )
+
+    # Drop all refs at once and measure how long until all callbacks fire.
+    refs_dropped_at = time.perf_counter()
+    del refs
+    gc.collect()
+
+    if not done.wait(timeout=timeout_s):
+        fired = num_refs - remaining[0]
+        raise TimeoutError(
+            f"Only {fired}/{num_refs} callbacks fired within {timeout_s}s"
+        )
+
+    settle_time_s = time.perf_counter() - refs_dropped_at
+    callbacks_per_sec = num_refs / settle_time_s if settle_time_s > 0 else float("inf")
+
+    print(f"  {num_refs} callbacks in {settle_time_s:.3f}s ({callbacks_per_sec:.0f}/s)")
+    return settle_time_s, callbacks_per_sec
+
+
+ray.init(address="auto")
+
+# Warm up gRPC connections and worker pools.
+ray.get(
+    [
+        create_object_on_worker.options(scheduling_strategy="SPREAD").remote()
+        for _ in range(NUM_WORKERS)
+    ]
+)
+
+settle_1k, throughput_1k = test_callback_throughput(1000)
+settle_5k, throughput_5k = test_callback_throughput(5000)
+
+print("\nSummary:")
+print(f"  1k: {throughput_1k:.0f}/s (settle: {settle_1k:.3f}s)")
+print(f"  5k: {throughput_5k:.0f}/s (settle: {settle_5k:.3f}s)")
+
+if "TEST_OUTPUT_JSON" in os.environ:
+    with open(os.environ["TEST_OUTPUT_JSON"], "w") as out_file:
+        results = {
+            "settle_1k": settle_1k,
+            "settle_5k": settle_5k,
+            "throughput_1k": throughput_1k,
+            "throughput_5k": throughput_5k,
+            "perf_metrics": [
+                {
+                    "perf_metric_name": "callback_burst_1k_per_second",
+                    "perf_metric_value": throughput_1k,
+                    "perf_metric_type": "THROUGHPUT",
+                },
+                {
+                    "perf_metric_name": "callback_burst_5k_per_second",
+                    "perf_metric_value": throughput_5k,
+                    "perf_metric_type": "THROUGHPUT",
+                },
+            ],
+        }
+        json.dump(results, out_file)
diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index aed53d63d920..0dc47429592c 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -3219,6 +3219,30 @@
       cluster:
         cluster_compute: object_store_gce.yaml
 
+- name: callback_throughput
+  python: "3.10"
+  group: core-scalability-test
+  working_dir: benchmarks
+
+  frequency: nightly
+  team: core
+  env: aws_perf
+  cluster:
+    anyscale_sdk_2026: true
+    byod:
+      runtime_env:
+        - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so
+    cluster_compute: object_store/callback_throughput.yaml
+
+  run:
+    timeout: 600
+    script: python object_store/test_callback_throughput.py
+    wait_for_nodes:
+      num_nodes: 101
+
+  variations:
+    - __suffix__: aws
+
 - name: small_objects
   python: "3.10"
   group: core-scalability-test

From 8b22ae947b6aa250630fe639fe379c6dccae19aa Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 24 Jun 2026 18:21:58 -0700
Subject: [PATCH 18/53] Bump up thread count due to new cleanup thread

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/tests/test_basic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/ray/tests/test_basic.py b/python/ray/tests/test_basic.py
index f9480d6897f8..9716a8d13e74 100644
--- a/python/ray/tests/test_basic.py
+++ b/python/ray/tests/test_basic.py
@@ -268,7 +268,7 @@ def get_thread_count(self):
         ray.get(actor.get_thread_count.remote())
     # Lowering these numbers in this assert should be celebrated,
     # increasing these numbers should be scrutinized
-    assert ray.get(actor.get_thread_count.remote()) in {20, 21, 22, 23}
+    assert ray.get(actor.get_thread_count.remote()) in {20, 21, 22, 23, 24}
 
 
 # https://github.com/ray-project/ray/issues/7287

From 16eca1f667e8112432afe8bab2433df32f607051 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 24 Jun 2026 18:37:20 -0700
Subject: [PATCH 19/53] Remove 20 from thread count since we net increased
 thread count

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/tests/test_basic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/ray/tests/test_basic.py b/python/ray/tests/test_basic.py
index 9716a8d13e74..ff3f2e260bfa 100644
--- a/python/ray/tests/test_basic.py
+++ b/python/ray/tests/test_basic.py
@@ -268,7 +268,7 @@ def get_thread_count(self):
         ray.get(actor.get_thread_count.remote())
     # Lowering these numbers in this assert should be celebrated,
     # increasing these numbers should be scrutinized
-    assert ray.get(actor.get_thread_count.remote()) in {20, 21, 22, 23, 24}
+    assert ray.get(actor.get_thread_count.remote()) in {21, 22, 23, 24}
 
 
 # https://github.com/ray-project/ray/issues/7287

From 13f4e4600aa1096e8fef8aebeb88a8ecd6cc4a01 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Fri, 26 Jun 2026 12:09:16 -0700
Subject: [PATCH 20/53] Integrate more realistic benchmark

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/_private/ray_perf.py               | 54 ---------------
 .../object_store/callback_throughput.yaml     |  4 +-
 .../object_store/test_callback_throughput.py  | 68 +++++++++++--------
 release/release_tests.yaml                    |  2 +-
 4 files changed, 41 insertions(+), 87 deletions(-)

diff --git a/python/ray/_private/ray_perf.py b/python/ray/_private/ray_perf.py
index f6602ea6acc1..f17fd2ac01a7 100644
--- a/python/ray/_private/ray_perf.py
+++ b/python/ray/_private/ray_perf.py
@@ -3,12 +3,10 @@
 import asyncio
 import logging
 import multiprocessing
-import threading
 
 import numpy as np
 
 import ray
-import ray._private.worker
 from ray._private.ray_client_microbenchmark import main as client_microbenchmark_main
 from ray._private.ray_microbenchmark_helpers import timeit
 
@@ -158,58 +156,6 @@ def get_containing_object_ref():
         "single client get object containing 10k refs", get_containing_object_ref
     )
 
-    # Object out-of-scope callbacks (used by Ray Data's BlockRefCounter for
-    # per-operator memory accounting). These exercise the
-    # ``CoreWorker.add_object_out_of_scope_callback`` path, which registers a
-    # Python callable that fires on a dedicated background thread once the
-    # object's last reference is dropped. We measure two things:
-    #   1. registration overhead, and
-    #   2. end-to-end throughput at scale from dropping the refs to every
-    #      callback firing on the background thread.
-    core_worker = ray._private.worker.global_worker.core_worker
-
-    def register_oos_callback():
-        ref = ray.put(0)
-        core_worker.add_object_out_of_scope_callback(ref, lambda _: None)
-        del ref
-
-    results += timeit(
-        "single client object out-of-scope callback registration",
-        register_oos_callback,
-    )
-
-    def oos_callback_fire_batch():
-        # Batch size is sized for a stable timeit signal (enough work per call to
-        # amortize fixed overhead, small enough that timeit runs it many times),
-        # not to model peak production concurrency.
-        n = 1000
-        remaining = [n]
-        lock = threading.Lock()
-        done = threading.Event()
-
-        def on_freed(_):
-            with lock:
-                remaining[0] -= 1
-                if remaining[0] == 0:
-                    done.set()
-
-        refs = [ray.put(0) for _ in range(n)]
-        assert all(
-            core_worker.add_object_out_of_scope_callback(ref, on_freed) for ref in refs
-        )
-        # Drop the last references; every callback must then fire on the
-        # dedicated background thread.
-        del refs
-        if not done.wait(timeout=30):
-            raise TimeoutError(
-                f"Only {n - remaining[0]}/{n} out-of-scope callbacks fired "
-                "within 30s"
-            )
-
-    results += timeit(
-        "object out-of-scope callback fire 1k batch", oos_callback_fire_batch, 1000
-    )
-
     def wait_multiple_refs():
         num_objs = 1000
         not_ready = [small_value.remote() for _ in range(num_objs)]
diff --git a/release/benchmarks/object_store/callback_throughput.yaml b/release/benchmarks/object_store/callback_throughput.yaml
index 5c49cc5f4242..f7073a6bffe0 100644
--- a/release/benchmarks/object_store/callback_throughput.yaml
+++ b/release/benchmarks/object_store/callback_throughput.yaml
@@ -7,8 +7,8 @@ head_node:
 
 worker_nodes:
     - instance_type: m6i.2xlarge
-      min_nodes: 100
-      max_nodes: 100
+      min_nodes: 10
+      max_nodes: 10
       market_type: ON_DEMAND
       resources:
         node: 1
diff --git a/release/benchmarks/object_store/test_callback_throughput.py b/release/benchmarks/object_store/test_callback_throughput.py
index 0e2ff6aa9054..5a5d62ec48b2 100644
--- a/release/benchmarks/object_store/test_callback_throughput.py
+++ b/release/benchmarks/object_store/test_callback_throughput.py
@@ -9,19 +9,24 @@
 import ray
 import ray._private.worker
 
-NUM_WORKERS = 100
+NUM_WORKERS = 10
 OBJECT_SIZE = 1024 * 1024  # 1 MiB, above the 100 KB inlining threshold
 
 
 @ray.remote(num_cpus=1)
-def create_object_on_worker():
+def produce_block():
     return np.zeros(OBJECT_SIZE, dtype=np.uint8)
 
 
-def test_callback_throughput(num_refs, timeout_s=60):
+@ray.remote(num_cpus=1)
+def consume_block(block_ref):
+    return len(block_ref)
+
+
+def test_callback_pipeline(num_blocks, timeout_s=60):
     core_worker = ray._private.worker.global_worker.core_worker
 
-    remaining = [num_refs]
+    remaining = [num_blocks]
     lock = threading.Lock()
     done = threading.Event()
 
@@ -31,32 +36,37 @@ def on_freed(_id_bytes):
             if remaining[0] == 0:
                 done.set()
 
+    start = time.perf_counter()
+
+    # Produce blocks (models MapOperator submitting tasks).
     refs = [
-        create_object_on_worker.options(scheduling_strategy="SPREAD").remote()
-        for _ in range(num_refs)
+        produce_block.options(scheduling_strategy="SPREAD").remote()
+        for _ in range(num_blocks)
     ]
     ray.wait(refs, num_returns=len(refs))
 
+    # Register callbacks (models BlockRefCounter.on_block_produced).
     assert all(
         core_worker.add_object_out_of_scope_callback(ref, on_freed) for ref in refs
     )
 
-    # Drop all refs at once and measure how long until all callbacks fire.
-    refs_dropped_at = time.perf_counter()
+    # Pass to consumers (models downstream operator receiving blocks).
+    consumer_futures = [consume_block.remote(ref) for ref in refs]
+    ray.get(consumer_futures)
+
+    # Drop all references (models blocks going out of scope after consumption).
     del refs
     gc.collect()
 
     if not done.wait(timeout=timeout_s):
-        fired = num_refs - remaining[0]
+        fired = num_blocks - remaining[0]
         raise TimeoutError(
-            f"Only {fired}/{num_refs} callbacks fired within {timeout_s}s"
+            f"Only {fired}/{num_blocks} callbacks fired within {timeout_s}s"
         )
 
-    settle_time_s = time.perf_counter() - refs_dropped_at
-    callbacks_per_sec = num_refs / settle_time_s if settle_time_s > 0 else float("inf")
-
-    print(f"  {num_refs} callbacks in {settle_time_s:.3f}s ({callbacks_per_sec:.0f}/s)")
-    return settle_time_s, callbacks_per_sec
+    total_s = time.perf_counter() - start
+    print(f"  {num_blocks} blocks: {total_s:.3f}s")
+    return total_s
 
 
 ray.init(address="auto")
@@ -64,35 +74,33 @@ def on_freed(_id_bytes):
 # Warm up gRPC connections and worker pools.
 ray.get(
     [
-        create_object_on_worker.options(scheduling_strategy="SPREAD").remote()
+        produce_block.options(scheduling_strategy="SPREAD").remote()
         for _ in range(NUM_WORKERS)
     ]
 )
 
-settle_1k, throughput_1k = test_callback_throughput(1000)
-settle_5k, throughput_5k = test_callback_throughput(5000)
+time_100 = test_callback_pipeline(100)
+time_1k = test_callback_pipeline(1000)
 
 print("\nSummary:")
-print(f"  1k: {throughput_1k:.0f}/s (settle: {settle_1k:.3f}s)")
-print(f"  5k: {throughput_5k:.0f}/s (settle: {settle_5k:.3f}s)")
+print(f"  100 blocks: {time_100:.3f}s")
+print(f"  1k blocks: {time_1k:.3f}s")
 
 if "TEST_OUTPUT_JSON" in os.environ:
     with open(os.environ["TEST_OUTPUT_JSON"], "w") as out_file:
         results = {
-            "settle_1k": settle_1k,
-            "settle_5k": settle_5k,
-            "throughput_1k": throughput_1k,
-            "throughput_5k": throughput_5k,
+            "time_100": time_100,
+            "time_1k": time_1k,
             "perf_metrics": [
                 {
-                    "perf_metric_name": "callback_burst_1k_per_second",
-                    "perf_metric_value": throughput_1k,
-                    "perf_metric_type": "THROUGHPUT",
+                    "perf_metric_name": "callback_pipeline_100_blocks_s",
+                    "perf_metric_value": time_100,
+                    "perf_metric_type": "LATENCY",
                 },
                 {
-                    "perf_metric_name": "callback_burst_5k_per_second",
-                    "perf_metric_value": throughput_5k,
-                    "perf_metric_type": "THROUGHPUT",
+                    "perf_metric_name": "callback_pipeline_1k_blocks_s",
+                    "perf_metric_value": time_1k,
+                    "perf_metric_type": "LATENCY",
                 },
             ],
         }
diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index 70fa33f90b25..849bea7023a4 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -3238,7 +3238,7 @@
     timeout: 600
     script: python object_store/test_callback_throughput.py
     wait_for_nodes:
-      num_nodes: 101
+      num_nodes: 11
 
   variations:
     - __suffix__: aws

From b30f0dd9d923cd176270ce6a5bb40d7401a5dd69 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Sun, 28 Jun 2026 16:52:52 -0700
Subject: [PATCH 21/53] Remove confusing comment

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 src/ray/core_worker/core_worker.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h
index 5b63b17b505f..6041ae3c81b5 100644
--- a/src/ray/core_worker/core_worker.h
+++ b/src/ray/core_worker/core_worker.h
@@ -422,9 +422,6 @@ class CoreWorker : public std::enable_shared_from_this<CoreWorker> {
   /// \param[in] callback Function to invoke when the object goes out of scope. Called
   ///            with (object_id, callback_context).
   /// \param[in] callback_context Opaque pointer forwarded unchanged to `callback`.
-  ///            In the Cython overload, this is a pointer to a Python `bytes` object
-  ///            containing the object ID binary, used as the key into the callback
-  ///            registry.
   /// \return true if registered; false if the object is already out of scope or freed
   ///         (callback will never fire).
   bool AddObjectOutOfScopeOrFreedCallback(const ObjectID &object_id,

From 4b81ad9b7c6b7c7b56e72959bf638ed556426be0 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Sun, 28 Jun 2026 17:43:07 -0700
Subject: [PATCH 22/53] Make test_callback_throughput benchmark more reflective
 of actual Ray Data execution logic

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../object_store/test_callback_throughput.py  | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/release/benchmarks/object_store/test_callback_throughput.py b/release/benchmarks/object_store/test_callback_throughput.py
index 5a5d62ec48b2..0d47d930b029 100644
--- a/release/benchmarks/object_store/test_callback_throughput.py
+++ b/release/benchmarks/object_store/test_callback_throughput.py
@@ -1,4 +1,3 @@
-import gc
 import json
 import os
 import threading
@@ -26,47 +25,48 @@ def consume_block(block_ref):
 def test_callback_pipeline(num_blocks, timeout_s=60):
     core_worker = ray._private.worker.global_worker.core_worker
 
-    remaining = [num_blocks]
+    latencies = []
+    drop_times = {}
     lock = threading.Lock()
     done = threading.Event()
 
-    def on_freed(_id_bytes):
+    def on_freed(id_bytes):
         with lock:
-            remaining[0] -= 1
-            if remaining[0] == 0:
+            latencies.append(time.perf_counter() - drop_times[id_bytes])
+            if len(latencies) == num_blocks:
                 done.set()
 
-    start = time.perf_counter()
-
-    # Produce blocks (models MapOperator submitting tasks).
     refs = [
         produce_block.options(scheduling_strategy="SPREAD").remote()
         for _ in range(num_blocks)
     ]
     ray.wait(refs, num_returns=len(refs))
 
-    # Register callbacks (models BlockRefCounter.on_block_produced).
-    assert all(
-        core_worker.add_object_out_of_scope_callback(ref, on_freed) for ref in refs
-    )
-
-    # Pass to consumers (models downstream operator receiving blocks).
-    consumer_futures = [consume_block.remote(ref) for ref in refs]
-    ray.get(consumer_futures)
-
-    # Drop all references (models blocks going out of scope after consumption).
+    # live_refs keeps each block ref alive until its consumer completes.
+    live_refs = {}
+    for ref in refs:
+        assert core_worker.add_object_out_of_scope_callback(ref, on_freed)
+        consumer = consume_block.remote(ref)
+        live_refs[consumer] = ref
     del refs
-    gc.collect()
+
+    # Release each ref as its consumer completes.
+    pending = list(live_refs.keys())
+    while pending:
+        done_list, pending = ray.wait(pending, num_returns=1)
+        for consumer in done_list:
+            ref = live_refs.pop(consumer)
+            drop_times[ref.binary()] = time.perf_counter()
+            del ref
 
     if not done.wait(timeout=timeout_s):
-        fired = num_blocks - remaining[0]
         raise TimeoutError(
-            f"Only {fired}/{num_blocks} callbacks fired within {timeout_s}s"
+            f"Only {len(latencies)}/{num_blocks} callbacks fired within {timeout_s}s"
         )
 
-    total_s = time.perf_counter() - start
-    print(f"  {num_blocks} blocks: {total_s:.3f}s")
-    return total_s
+    avg_latency = sum(latencies) / len(latencies)
+    print(f"  {num_blocks} blocks: avg={avg_latency:.4f}s")
+    return avg_latency
 
 
 ray.init(address="auto")

From 5a9e1ed21c36f39e86de74666a7de4e6c6937581 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 29 Jun 2026 09:54:58 -0700
Subject: [PATCH 23/53] Switch to p95 latency to more accurately capture
 regressions

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../object_store/test_callback_throughput.py  | 27 ++++++++++---------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/release/benchmarks/object_store/test_callback_throughput.py b/release/benchmarks/object_store/test_callback_throughput.py
index 0d47d930b029..b45ba848806d 100644
--- a/release/benchmarks/object_store/test_callback_throughput.py
+++ b/release/benchmarks/object_store/test_callback_throughput.py
@@ -64,9 +64,10 @@ def on_freed(id_bytes):
             f"Only {len(latencies)}/{num_blocks} callbacks fired within {timeout_s}s"
         )
 
-    avg_latency = sum(latencies) / len(latencies)
-    print(f"  {num_blocks} blocks: avg={avg_latency:.4f}s")
-    return avg_latency
+    latencies.sort()
+    p95 = latencies[int(len(latencies) * 0.95)]
+    print(f"  {num_blocks} blocks: p95={p95:.4f}s")
+    return p95
 
 
 ray.init(address="auto")
@@ -79,27 +80,27 @@ def on_freed(id_bytes):
     ]
 )
 
-time_100 = test_callback_pipeline(100)
-time_1k = test_callback_pipeline(1000)
+p95_100 = test_callback_pipeline(100)
+p95_1k = test_callback_pipeline(1000)
 
 print("\nSummary:")
-print(f"  100 blocks: {time_100:.3f}s")
-print(f"  1k blocks: {time_1k:.3f}s")
+print(f"  100 blocks: p95={p95_100:.4f}s")
+print(f"  1k blocks: p95={p95_1k:.4f}s")
 
 if "TEST_OUTPUT_JSON" in os.environ:
     with open(os.environ["TEST_OUTPUT_JSON"], "w") as out_file:
         results = {
-            "time_100": time_100,
-            "time_1k": time_1k,
+            "p95_100": p95_100,
+            "p95_1k": p95_1k,
             "perf_metrics": [
                 {
-                    "perf_metric_name": "callback_pipeline_100_blocks_s",
-                    "perf_metric_value": time_100,
+                    "perf_metric_name": "callback_p95_latency_100_blocks_s",
+                    "perf_metric_value": p95_100,
                     "perf_metric_type": "LATENCY",
                 },
                 {
-                    "perf_metric_name": "callback_pipeline_1k_blocks_s",
-                    "perf_metric_value": time_1k,
+                    "perf_metric_name": "callback_p95_latency_1k_blocks_s",
+                    "perf_metric_value": p95_1k,
                     "perf_metric_type": "LATENCY",
                 },
             ],

From 15c73f591b189da6ae99c49e4668b2454346c92a Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 17 Jun 2026 15:14:22 -0700
Subject: [PATCH 24/53] Add BlockRefCounter Implementation and Tests

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/data/BUILD.bazel                   |  14 +
 .../_internal/execution/block_ref_counter.py  |  72 +++++
 .../ray/data/tests/test_block_ref_counter.py  | 297 ++++++++++++++++++
 3 files changed, 383 insertions(+)
 create mode 100644 python/ray/data/_internal/execution/block_ref_counter.py
 create mode 100644 python/ray/data/tests/test_block_ref_counter.py

diff --git a/python/ray/data/BUILD.bazel b/python/ray/data/BUILD.bazel
index 960aa67d891f..5d859b890ec4 100644
--- a/python/ray/data/BUILD.bazel
+++ b/python/ray/data/BUILD.bazel
@@ -1426,6 +1426,20 @@ py_test(
     ],
 )
 
+py_test(
+    name = "test_block_ref_counter",
+    size = "small",
+    srcs = ["tests/test_block_ref_counter.py"],
+    tags = [
+        "exclusive",
+        "team:data",
+    ],
+    deps = [
+        ":conftest",
+        "//:ray_lib",
+    ],
+)
+
 py_test(
     name = "test_map_operator",
     size = "medium",
diff --git a/python/ray/data/_internal/execution/block_ref_counter.py b/python/ray/data/_internal/execution/block_ref_counter.py
new file mode 100644
index 000000000000..6cd5bed5f5f0
--- /dev/null
+++ b/python/ray/data/_internal/execution/block_ref_counter.py
@@ -0,0 +1,72 @@
+import threading
+from collections import defaultdict
+from typing import Dict
+
+import ray
+import ray._private.worker
+
+
+class BlockRefCounter:
+    """Tracks object-store memory usage per operator via Ray Core callbacks.
+
+    The callback fires when:
+    - All Python ObjectRefs wrapping the block's ObjectID are garbage-collected, AND
+    - All Ray tasks that received the block as an argument have completed.
+    """
+
+    def __init__(self):
+        # Object ID binaries of currently live blocks; used by _on_object_freed
+        # to distinguish a racing clear() from a real callback.
+        self._registered_ids: set[bytes] = set()
+        # (producer_id -> total live bytes); maintained incrementally for O(1) reads.
+        self._bytes_by_producer: Dict[str, int] = defaultdict(int)
+        self._lock = threading.Lock()
+
+    def on_block_produced(
+        self,
+        block_ref: "ray.ObjectRef",
+        size_bytes: int,
+        producer_id: str,
+    ) -> None:
+        """Register a block and attribute its memory to producer_id.
+
+        Registers a Ray Core out-of-scope callback so that when all references
+        to block_ref are gone the bytes are automatically removed from the
+        producer's usage.
+        """
+        id_binary = block_ref.binary()
+        with self._lock:
+            self._registered_ids.add(id_binary)
+            self._bytes_by_producer[producer_id] += size_bytes
+
+        def _on_object_freed(id_bytes: bytes) -> None:
+            with self._lock:
+                if id_bytes not in self._registered_ids:
+                    # Already cleared (e.g. by clear()), nothing to do.
+                    return
+                self._registered_ids.discard(id_bytes)
+                self._bytes_by_producer[producer_id] -= size_bytes
+                if self._bytes_by_producer[producer_id] == 0:
+                    del self._bytes_by_producer[producer_id]
+
+        core_worker = ray._private.worker.global_worker.core_worker  # type: ignore[attr-defined]
+        registered = core_worker.add_object_out_of_scope_callback(
+            block_ref, _on_object_freed
+        )
+        if not registered:
+            _on_object_freed(id_binary)
+
+    def get_object_store_memory_usage(self, producer_id: str) -> int:
+        """Total bytes of live blocks attributed to producer_id."""
+        with self._lock:
+            return self._bytes_by_producer.get(producer_id, 0)
+
+    def clear(self) -> None:
+        """Reset all accounting, e.g. on executor shutdown.
+
+        Any previously registered Ray Core callbacks firing after clear()
+        will be silently ignored because _registered_ids is empty.
+        """
+        with self._lock:
+            self._registered_ids.clear()
+            self._bytes_by_producer.clear()
diff --git a/python/ray/data/tests/test_block_ref_counter.py b/python/ray/data/tests/test_block_ref_counter.py
new file mode 100644
index 000000000000..4cf9caed7809
--- /dev/null
+++ b/python/ray/data/tests/test_block_ref_counter.py
@@ -0,0 +1,297 @@
+import gc
+import threading
+import time
+import unittest.mock as mock
+
+import pytest
+
+import ray
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
+from ray.tests.conftest import *  # noqa
+
+
+class _FakeRef:
+    """Minimal stand-in for ray.ObjectRef. Has a .binary() that returns bytes."""
+
+    def __init__(self, uid: int):
+        self._binary = uid.to_bytes(28, "big")
+
+    def binary(self) -> bytes:
+        return self._binary
+
+
+def _register_block(counter, ref, size_bytes, producer_id):
+    """Call on_block_produced on an existing counter with a mocked core worker.
+
+    Returns the captured _on_out_of_scope callback so tests can fire it directly.
+    """
+    captured_callback = None
+
+    class _MockCoreWorker:
+        def add_object_out_of_scope_callback(self, block_ref, cb):
+            nonlocal captured_callback
+            captured_callback = cb
+            return True
+
+    with mock.patch(
+        "ray._private.worker.global_worker",
+        mock.Mock(core_worker=_MockCoreWorker()),
+    ):
+        counter.on_block_produced(ref, size_bytes, producer_id)
+
+    return captured_callback, ref.binary()
+
+
+class TestBlockRefCounterAccounting:
+    def test_single_block_produced_and_released(self):
+        counter = BlockRefCounter()
+        ref = _FakeRef(1)
+        callback, id_binary = _register_block(counter, ref, 100, "op_a")
+
+        assert counter.get_object_store_memory_usage("op_a") == 100
+        callback(id_binary)
+        assert counter.get_object_store_memory_usage("op_a") == 0
+
+    def test_multiple_blocks_same_producer(self):
+        counter = BlockRefCounter()
+        ref1, ref2 = _FakeRef(1), _FakeRef(2)
+        cb1, bin1 = _register_block(counter, ref1, 100, "op_a")
+        cb2, bin2 = _register_block(counter, ref2, 200, "op_a")
+
+        assert counter.get_object_store_memory_usage("op_a") == 300
+        cb1(bin1)
+        assert counter.get_object_store_memory_usage("op_a") == 200
+        cb2(bin2)
+        assert counter.get_object_store_memory_usage("op_a") == 0
+
+    def test_multiple_producers_isolated(self):
+        counter = BlockRefCounter()
+        ref1, ref2 = _FakeRef(1), _FakeRef(2)
+        cb1, bin1 = _register_block(counter, ref1, 100, "op_a")
+        _register_block(counter, ref2, 200, "op_b")
+
+        assert counter.get_object_store_memory_usage("op_a") == 100
+        assert counter.get_object_store_memory_usage("op_b") == 200
+
+        cb1(bin1)
+        assert counter.get_object_store_memory_usage("op_a") == 0
+        assert counter.get_object_store_memory_usage("op_b") == 200
+
+
+class TestBlockRefCounterClear:
+    def test_clear_resets_usage(self):
+        counter = BlockRefCounter()
+        _register_block(counter, _FakeRef(1), 100, "op_a")
+        assert counter.get_object_store_memory_usage("op_a") == 100
+
+        counter.clear()
+        assert counter.get_object_store_memory_usage("op_a") == 0
+
+    def test_callback_after_clear_is_noop(self):
+        """A callback firing after clear() must not crash or corrupt state."""
+        counter = BlockRefCounter()
+        ref = _FakeRef(1)
+        callback, id_binary = _register_block(counter, ref, 100, "op_a")
+
+        counter.clear()
+        callback(id_binary)  # must be a silent no-op
+        assert counter.get_object_store_memory_usage("op_a") == 0
+
+    def test_new_blocks_after_clear_are_tracked(self):
+        """After clear(), new registrations work normally."""
+        counter = BlockRefCounter()
+        _register_block(counter, _FakeRef(1), 50, "op_b")
+        counter.clear()
+        assert counter.get_object_store_memory_usage("op_b") == 0
+
+        _register_block(counter, _FakeRef(2), 50, "op_b")
+        assert counter.get_object_store_memory_usage("op_b") == 50
+
+    def test_clear_races_with_object_already_freed(self):
+        """clear() between byte-increment and the registered=False undo must not go negative.
+
+        If add_object_out_of_scope_callback returns False (object already gone),
+        on_block_produced calls _on_object_freed to undo the increment. If clear()
+        fires in that window, the undo must be a no-op (id_binary is no longer in
+        _registered_ids), not a double-decrement.
+        """
+        counter = BlockRefCounter()
+        ref = _FakeRef(1)
+
+        class _ClearOnRegisterCoreWorker:
+            def add_object_out_of_scope_callback(self, block_ref, cb):
+                counter.clear()  # race: clear fires before finally runs
+                return False  # object already out of scope
+
+        with mock.patch(
+            "ray._private.worker.global_worker",
+            mock.Mock(core_worker=_ClearOnRegisterCoreWorker()),
+        ):
+            counter.on_block_produced(ref, 100, "op_a")
+
+        assert counter.get_object_store_memory_usage("op_a") == 0
+
+
+class TestBlockRefCounterThreadSafety:
+    def test_concurrent_callbacks_dont_corrupt_state(self):
+        """Multiple threads firing callbacks concurrently must not go negative."""
+        counter = BlockRefCounter()
+        producer_id = "op_concurrent"
+        n = 50
+        refs = [_FakeRef(i) for i in range(n)]
+        callbacks = []
+
+        for ref in refs:
+            cb, id_binary = _register_block(counter, ref, 10, producer_id)
+            callbacks.append((cb, id_binary))
+
+        threads = [threading.Thread(target=cb, args=(idb,)) for cb, idb in callbacks]
+        for t in threads:
+            t.start()
+        for t in threads:
+            t.join()
+
+        assert counter.get_object_store_memory_usage(producer_id) == 0
+
+
+@ray.remote
+def _hold_ref_for(block_ref, sleep_s: float) -> bool:
+    """Hold *block_ref* as a task argument for *sleep_s* seconds, then return.
+
+    Because Ray keeps the object alive for the duration of any task that
+    received it as an argument, this lets tests verify the callback has
+    not fired while the task is still running.
+    """
+    import time as _time
+
+    _time.sleep(sleep_s)
+    return True
+
+
+def _wait_for_counter(
+    counter: BlockRefCounter,
+    producer_id: str,
+    expected: int,
+    timeout_s: float = 10.0,
+    poll_interval_s: float = 0.05,
+) -> bool:
+    """Poll until *counter* reports *expected* bytes for *producer_id*.
+
+    Calls ``gc.collect()`` on every iteration so that any pending Python-level
+    ObjectRef destructors have a chance to run.  Returns True if the expected
+    value is reached before *timeout_s* elapses, False otherwise.
+    """
+    deadline = time.monotonic() + timeout_s
+    while time.monotonic() < deadline:
+        gc.collect()
+        if counter.get_object_store_memory_usage(producer_id) == expected:
+            return True
+        time.sleep(poll_interval_s)
+    return False
+
+
+class TestBlockRefCounterLifecycle:
+    """Integration tests that exercise the full add_object_out_of_scope_callback path.
+
+    All tests in this class require a live Ray cluster (ray_start_regular_shared).
+    They verify that the out-of-scope callback fires at exactly the right moment:
+    not before the last reference drops, and not after it.
+
+    Three cases are covered:
+      1. Basic lifecycle: callback fires after the last Python ObjectRef is GC'd.
+      2. Two Python refs: callback fires only after both refs are dropped.
+      3. Task ref: callback fires only after the holding task finishes and all
+         Python refs are dropped. This matches the real operator lifecycle where
+         a block stays live until the task that received it as an argument completes.
+    """
+
+    # Byte count attributed to the test operator.  The actual object put into
+    # the store is much smaller; we only care that the counter tracks *this*
+    # number faithfully.
+    _SIZE_BYTES = 1 * 1024 * 1024  # 1 MB
+
+    def _make_block(self) -> "ray.ObjectRef":
+        import numpy as np
+
+        return ray.put(np.zeros(128, dtype=np.float64))
+
+    def test_callback_fires_after_last_python_ref_deleted(
+        self, ray_start_regular_shared
+    ):
+        """Counter reaches 0 once the only Python ObjectRef is GC'd."""
+        counter = BlockRefCounter()
+        ref = self._make_block()
+
+        counter.on_block_produced(ref, self._SIZE_BYTES, "op_basic")
+        assert counter.get_object_store_memory_usage("op_basic") == self._SIZE_BYTES
+
+        del ref  # last Python ref gone
+        assert _wait_for_counter(counter, "op_basic", 0), (
+            "Counter did not reach 0 after all Python refs were deleted; "
+            f"remaining: {counter.get_object_store_memory_usage('op_basic')} bytes"
+        )
+
+    def test_second_python_ref_keeps_counter_alive(self, ray_start_regular_shared):
+        """Counter stays non-zero while a second Python ObjectRef is alive.
+
+        Dropping one of two refs that point at the same ObjectID must NOT fire
+        the callback. Only the final ref drop may do so.
+        """
+        counter = BlockRefCounter()
+        ref1 = self._make_block()
+        ref2 = ref1  # second Python ref to the same ObjectID
+
+        counter.on_block_produced(ref1, self._SIZE_BYTES, "op_two_refs")
+        assert counter.get_object_store_memory_usage("op_two_refs") == self._SIZE_BYTES
+
+        del ref1
+        gc.collect()
+        time.sleep(0.3)  # give GC ample time; counter must still be non-zero
+
+        assert (
+            counter.get_object_store_memory_usage("op_two_refs") == self._SIZE_BYTES
+        ), "Callback fired too early — counter decremented while ref2 was still alive"
+
+        del ref2  # last ref gone; callback must now fire
+        assert _wait_for_counter(
+            counter, "op_two_refs", 0
+        ), "Counter did not reach 0 after the last Python ref was deleted"
+
+    def test_task_ref_keeps_counter_alive_until_task_completes(
+        self, ray_start_regular_shared
+    ):
+        """Counter stays non-zero while a running Ray task holds the block.
+
+        Ray keeps any object alive for the duration of a task that received it
+        as an argument.  The callback should not fire until both conditions hold:
+        (a) the task has completed, and (b) all Python refs are dropped.
+        """
+        counter = BlockRefCounter()
+        ref = self._make_block()
+
+        counter.on_block_produced(ref, self._SIZE_BYTES, "op_task")
+        assert counter.get_object_store_memory_usage("op_task") == self._SIZE_BYTES
+
+        # Submit a task that sleeps for 1 s while holding the block, then drop
+        # the Python ref so only the task's argument reference remains.
+        task_future = _hold_ref_for.remote(ref, 1.0)
+        del ref
+        gc.collect()
+        time.sleep(0.3)  # task is still running; callback must NOT have fired
+
+        assert (
+            counter.get_object_store_memory_usage("op_task") == self._SIZE_BYTES
+        ), "Callback fired too early: counter decremented while task was still running"
+
+        ray.get(task_future)  # task completes; now both refs are gone
+        assert _wait_for_counter(
+            counter, "op_task", 0
+        ), "Counter did not reach 0 after task completed and Python ref was deleted"
+
+
+if __name__ == "__main__":
+    import sys
+
+    import pytest
+
+    sys.exit(pytest.main(["-v", __file__]))

From b17bbc9b69eea189f60662bf9ac2a517527625e1 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 22 Jun 2026 12:03:25 -0700
Subject: [PATCH 25/53] Address comments and update tests

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../_internal/execution/block_ref_counter.py  |  31 +-
 .../ray/data/tests/test_block_ref_counter.py  | 282 ++++--------------
 .../data/tests/unit/test_block_ref_counter.py | 140 +++++++++
 3 files changed, 223 insertions(+), 230 deletions(-)
 create mode 100644 python/ray/data/tests/unit/test_block_ref_counter.py

diff --git a/python/ray/data/_internal/execution/block_ref_counter.py b/python/ray/data/_internal/execution/block_ref_counter.py
index 6cd5bed5f5f0..148b8f4a7bdb 100644
--- a/python/ray/data/_internal/execution/block_ref_counter.py
+++ b/python/ray/data/_internal/execution/block_ref_counter.py
@@ -1,9 +1,8 @@
 import threading
 from collections import defaultdict
-from typing import Dict
+from typing import Callable, Dict, Optional
 
 import ray
-import ray._private.worker
 
 
 class BlockRefCounter:
@@ -14,9 +13,15 @@ class BlockRefCounter:
     - All Ray tasks that received the block as an argument have completed.
     """
 
-    def __init__(self):
-        # Object ID binaries of currently live blocks; used by _on_object_freed
-        # to distinguish a racing clear() from a real callback.
+    def __init__(
+        self,
+        add_object_out_of_scope_callback: Optional[
+            Callable[["ray.ObjectRef", Callable[[bytes], None]], bool]
+        ] = None,
+    ):
+        self._add_callback_fn = add_object_out_of_scope_callback
+        # IDs of live blocks. Stale callbacks (fired after clear()) check
+        # membership here and no-op, preventing negative _bytes_by_producer.
         self._registered_ids: set[bytes] = set()
         # (producer_id -> total live bytes); maintained incrementally for O(1) reads.
         self._bytes_by_producer: Dict[str, int] = defaultdict(int)
@@ -33,6 +38,8 @@ def on_block_produced(
         Registers a Ray Core out-of-scope callback so that when all references
         to block_ref are gone the bytes are automatically removed from the
         producer's usage.
+
+        Not idempotent: calling twice with the same block_ref double-counts.
         """
         id_binary = block_ref.binary()
         with self._lock:
@@ -46,13 +53,15 @@ def _on_object_freed(id_bytes: bytes) -> None:
                     return
                 self._registered_ids.discard(id_bytes)
                 self._bytes_by_producer[producer_id] -= size_bytes
-                if self._bytes_by_producer[producer_id] == 0:
-                    del self._bytes_by_producer[producer_id]
 
-        core_worker = ray._private.worker.global_worker.core_worker  # type: ignore[attr-defined]
-        registered = core_worker.add_object_out_of_scope_callback(
-            block_ref, _on_object_freed
-        )
+        add_callback = self._add_callback_fn
+        if add_callback is None:
+            import ray._private.worker
+
+            core_worker = ray._private.worker.global_worker.core_worker  # type: ignore[attr-defined]
+            add_callback = core_worker.add_object_out_of_scope_callback
+
+        registered = add_callback(block_ref, _on_object_freed)
         if not registered:
             _on_object_freed(id_binary)
 
diff --git a/python/ray/data/tests/test_block_ref_counter.py b/python/ray/data/tests/test_block_ref_counter.py
index 4cf9caed7809..e22007f75879 100644
--- a/python/ray/data/tests/test_block_ref_counter.py
+++ b/python/ray/data/tests/test_block_ref_counter.py
@@ -1,261 +1,105 @@
 import gc
-import threading
 import time
-import unittest.mock as mock
 
+import numpy as np
 import pytest
 
 import ray
+from ray._common.test_utils import wait_for_condition
 from ray.data._internal.execution.block_ref_counter import BlockRefCounter
+from ray.data._internal.util import MiB
 from ray.tests.conftest import *  # noqa
 
-
-class _FakeRef:
-    """Minimal stand-in for ray.ObjectRef. Has a .binary() that returns bytes."""
-
-    def __init__(self, uid: int):
-        self._binary = uid.to_bytes(28, "big")
-
-    def binary(self) -> bytes:
-        return self._binary
-
-
-def _register_block(counter, ref, size_bytes, producer_id):
-    """Call on_block_produced on an existing counter with a mocked core worker.
-
-    Returns the captured _on_out_of_scope callback so tests can fire it directly.
-    """
-    captured_callback = None
-
-    class _MockCoreWorker:
-        def add_object_out_of_scope_callback(self, block_ref, cb):
-            nonlocal captured_callback
-            captured_callback = cb
-            return True
-
-    with mock.patch(
-        "ray._private.worker.global_worker",
-        mock.Mock(core_worker=_MockCoreWorker()),
-    ):
-        counter.on_block_produced(ref, size_bytes, producer_id)
-
-    return captured_callback, ref.binary()
-
-
-class TestBlockRefCounterAccounting:
-    def test_single_block_produced_and_released(self):
-        counter = BlockRefCounter()
-        ref = _FakeRef(1)
-        callback, id_binary = _register_block(counter, ref, 100, "op_a")
-
-        assert counter.get_object_store_memory_usage("op_a") == 100
-        callback(id_binary)
-        assert counter.get_object_store_memory_usage("op_a") == 0
-
-    def test_multiple_blocks_same_producer(self):
-        counter = BlockRefCounter()
-        ref1, ref2 = _FakeRef(1), _FakeRef(2)
-        cb1, bin1 = _register_block(counter, ref1, 100, "op_a")
-        cb2, bin2 = _register_block(counter, ref2, 200, "op_a")
-
-        assert counter.get_object_store_memory_usage("op_a") == 300
-        cb1(bin1)
-        assert counter.get_object_store_memory_usage("op_a") == 200
-        cb2(bin2)
-        assert counter.get_object_store_memory_usage("op_a") == 0
-
-    def test_multiple_producers_isolated(self):
-        counter = BlockRefCounter()
-        ref1, ref2 = _FakeRef(1), _FakeRef(2)
-        cb1, bin1 = _register_block(counter, ref1, 100, "op_a")
-        _register_block(counter, ref2, 200, "op_b")
-
-        assert counter.get_object_store_memory_usage("op_a") == 100
-        assert counter.get_object_store_memory_usage("op_b") == 200
-
-        cb1(bin1)
-        assert counter.get_object_store_memory_usage("op_a") == 0
-        assert counter.get_object_store_memory_usage("op_b") == 200
-
-
-class TestBlockRefCounterClear:
-    def test_clear_resets_usage(self):
-        counter = BlockRefCounter()
-        _register_block(counter, _FakeRef(1), 100, "op_a")
-        assert counter.get_object_store_memory_usage("op_a") == 100
-
-        counter.clear()
-        assert counter.get_object_store_memory_usage("op_a") == 0
-
-    def test_callback_after_clear_is_noop(self):
-        """A callback firing after clear() must not crash or corrupt state."""
-        counter = BlockRefCounter()
-        ref = _FakeRef(1)
-        callback, id_binary = _register_block(counter, ref, 100, "op_a")
-
-        counter.clear()
-        callback(id_binary)  # must be a silent no-op
-        assert counter.get_object_store_memory_usage("op_a") == 0
-
-    def test_new_blocks_after_clear_are_tracked(self):
-        """After clear(), new registrations work normally."""
-        counter = BlockRefCounter()
-        _register_block(counter, _FakeRef(1), 50, "op_b")
-        counter.clear()
-        assert counter.get_object_store_memory_usage("op_b") == 0
-
-        _register_block(counter, _FakeRef(2), 50, "op_b")
-        assert counter.get_object_store_memory_usage("op_b") == 50
-
-    def test_clear_races_with_object_already_freed(self):
-        """clear() between byte-increment and the registered=False undo must not go negative.
-
-        If add_object_out_of_scope_callback returns False (object already gone),
-        on_block_produced calls _on_object_freed to undo the increment. If clear()
-        fires in that window, the undo must be a no-op (id_binary is no longer in
-        _registered_ids), not a double-decrement.
-        """
-        counter = BlockRefCounter()
-        ref = _FakeRef(1)
-
-        class _ClearOnRegisterCoreWorker:
-            def add_object_out_of_scope_callback(self, block_ref, cb):
-                counter.clear()  # race: clear fires before finally runs
-                return False  # object already out of scope
-
-        with mock.patch(
-            "ray._private.worker.global_worker",
-            mock.Mock(core_worker=_ClearOnRegisterCoreWorker()),
-        ):
-            counter.on_block_produced(ref, 100, "op_a")
-
-        assert counter.get_object_store_memory_usage("op_a") == 0
-
-
-class TestBlockRefCounterThreadSafety:
-    def test_concurrent_callbacks_dont_corrupt_state(self):
-        """Multiple threads firing callbacks concurrently must not go negative."""
-        counter = BlockRefCounter()
-        producer_id = "op_concurrent"
-        n = 50
-        refs = [_FakeRef(i) for i in range(n)]
-        callbacks = []
-
-        for ref in refs:
-            cb, id_binary = _register_block(counter, ref, 10, producer_id)
-            callbacks.append((cb, id_binary))
-
-        threads = [threading.Thread(target=cb, args=(idb,)) for cb, idb in callbacks]
-        for t in threads:
-            t.start()
-        for t in threads:
-            t.join()
-
-        assert counter.get_object_store_memory_usage(producer_id) == 0
+# Grace period for asserting a callback has NOT fired. Must be shorter than the
+# task sleep in test_task_ref_keeps_counter_alive_until_task_completes (1.0s);
+# 0.3s leaves wide margin even on slow CI while still surfacing early-fire bugs.
+_EARLY_FIRE_GRACE_S = 0.3
 
 
 @ray.remote
 def _hold_ref_for(block_ref, sleep_s: float) -> bool:
     """Hold *block_ref* as a task argument for *sleep_s* seconds, then return.
 
-    Because Ray keeps the object alive for the duration of any task that
-    received it as an argument, this lets tests verify the callback has
-    not fired while the task is still running.
+    Ray keeps an object alive for the duration of any task that received it as
+    an argument, so this lets tests assert the callback has not fired while the
+    task is still running.
     """
-    import time as _time
-
-    _time.sleep(sleep_s)
+    time.sleep(sleep_s)
     return True
 
 
-def _wait_for_counter(
-    counter: BlockRefCounter,
-    producer_id: str,
-    expected: int,
-    timeout_s: float = 10.0,
-    poll_interval_s: float = 0.05,
-) -> bool:
-    """Poll until *counter* reports *expected* bytes for *producer_id*.
+@pytest.fixture(params=["inlined", "regular"])
+def make_block(request):
+    """Factory for a block ObjectRef, parametrized over the two storage paths.
 
-    Calls ``gc.collect()`` on every iteration so that any pending Python-level
-    ObjectRef destructors have a chance to run.  Returns True if the expected
-    value is reached before *timeout_s* elapses, False otherwise.
+    Ray Core inlines tiny objects in the in-process store and puts larger ones
+    in the shared-memory object store; the out-of-scope callback must work for
+    both. Returning a factory (rather than the ref itself) avoids pytest holding
+    an extra reference that would keep the object alive past the test's own ``del``.
     """
-    deadline = time.monotonic() + timeout_s
-    while time.monotonic() < deadline:
-        gc.collect()
-        if counter.get_object_store_memory_usage(producer_id) == expected:
-            return True
-        time.sleep(poll_interval_s)
-    return False
 
+    def _make() -> "ray.ObjectRef":
+        if request.param == "inlined":
+            return ray.put(0)
+        return ray.put(np.zeros(1 * MiB, dtype=np.uint8))
 
-class TestBlockRefCounterLifecycle:
-    """Integration tests that exercise the full add_object_out_of_scope_callback path.
-
-    All tests in this class require a live Ray cluster (ray_start_regular_shared).
-    They verify that the out-of-scope callback fires at exactly the right moment:
-    not before the last reference drops, and not after it.
-
-    Three cases are covered:
-      1. Basic lifecycle: callback fires after the last Python ObjectRef is GC'd.
-      2. Two Python refs: callback fires only after both refs are dropped.
-      3. Task ref: callback fires only after the holding task finishes and all
-         Python refs are dropped. This matches the real operator lifecycle where
-         a block stays live until the task that received it as an argument completes.
+    return _make
+
+
+def _wait_for_counter(counter, producer_id, expected, timeout_s: float = 10.0):
+    """Wait until *counter* reports *expected* bytes for *producer_id*.
+
+    ``gc.collect()`` runs on each poll so any pending Python-level ObjectRef
+    destructors get a chance to run; the polling/timeout loop is delegated to
+    ``wait_for_condition`` (raises on timeout).
     """
 
-    # Byte count attributed to the test operator.  The actual object put into
-    # the store is much smaller; we only care that the counter tracks *this*
-    # number faithfully.
-    _SIZE_BYTES = 1 * 1024 * 1024  # 1 MB
+    def _reached():
+        gc.collect()
+        return counter.get_object_store_memory_usage(producer_id) == expected
 
-    def _make_block(self) -> "ray.ObjectRef":
-        import numpy as np
+    wait_for_condition(_reached, timeout=timeout_s)
 
-        return ray.put(np.zeros(128, dtype=np.float64))
 
+class TestBlockRefCounterLifecycle:
     def test_callback_fires_after_last_python_ref_deleted(
-        self, ray_start_regular_shared
+        self, ray_start_regular_shared, make_block
     ):
         """Counter reaches 0 once the only Python ObjectRef is GC'd."""
         counter = BlockRefCounter()
-        ref = self._make_block()
+        ref = make_block()
 
-        counter.on_block_produced(ref, self._SIZE_BYTES, "op_basic")
-        assert counter.get_object_store_memory_usage("op_basic") == self._SIZE_BYTES
+        counter.on_block_produced(ref, 1 * MiB, "op_basic")
+        assert counter.get_object_store_memory_usage("op_basic") == 1 * MiB
 
         del ref  # last Python ref gone
-        assert _wait_for_counter(counter, "op_basic", 0), (
-            "Counter did not reach 0 after all Python refs were deleted; "
-            f"remaining: {counter.get_object_store_memory_usage('op_basic')} bytes"
-        )
+        _wait_for_counter(counter, "op_basic", 0)
 
-    def test_second_python_ref_keeps_counter_alive(self, ray_start_regular_shared):
+    def test_second_python_ref_keeps_counter_alive(
+        self, ray_start_regular_shared, make_block
+    ):
         """Counter stays non-zero while a second Python ObjectRef is alive.
 
         Dropping one of two refs that point at the same ObjectID must NOT fire
         the callback. Only the final ref drop may do so.
         """
         counter = BlockRefCounter()
-        ref1 = self._make_block()
+        ref1 = make_block()
         ref2 = ref1  # second Python ref to the same ObjectID
 
-        counter.on_block_produced(ref1, self._SIZE_BYTES, "op_two_refs")
-        assert counter.get_object_store_memory_usage("op_two_refs") == self._SIZE_BYTES
+        counter.on_block_produced(ref1, 1 * MiB, "op_two_refs")
+        assert counter.get_object_store_memory_usage("op_two_refs") == 1 * MiB
 
         del ref1
         gc.collect()
-        time.sleep(0.3)  # give GC ample time; counter must still be non-zero
+        time.sleep(_EARLY_FIRE_GRACE_S)  # counter must still be non-zero
 
         assert (
-            counter.get_object_store_memory_usage("op_two_refs") == self._SIZE_BYTES
+            counter.get_object_store_memory_usage("op_two_refs") == 1 * MiB
         ), "Callback fired too early — counter decremented while ref2 was still alive"
 
         del ref2  # last ref gone; callback must now fire
-        assert _wait_for_counter(
-            counter, "op_two_refs", 0
-        ), "Counter did not reach 0 after the last Python ref was deleted"
+        _wait_for_counter(counter, "op_two_refs", 0)
 
     def test_task_ref_keeps_counter_alive_until_task_completes(
         self, ray_start_regular_shared
@@ -263,35 +107,35 @@ def test_task_ref_keeps_counter_alive_until_task_completes(
         """Counter stays non-zero while a running Ray task holds the block.
 
         Ray keeps any object alive for the duration of a task that received it
-        as an argument.  The callback should not fire until both conditions hold:
+        as an argument. The callback should not fire until both conditions hold:
         (a) the task has completed, and (b) all Python refs are dropped.
+
+        Uses a plasma (by-reference) object specifically: tiny objects are
+        inlined into the task by value, so they would not get a task-argument
+        reference and this lifetime-extension behavior would not apply.
         """
         counter = BlockRefCounter()
-        ref = self._make_block()
+        ref = ray.put(np.zeros(1 * MiB, dtype=np.uint8))
 
-        counter.on_block_produced(ref, self._SIZE_BYTES, "op_task")
-        assert counter.get_object_store_memory_usage("op_task") == self._SIZE_BYTES
+        counter.on_block_produced(ref, 1 * MiB, "op_task")
+        assert counter.get_object_store_memory_usage("op_task") == 1 * MiB
 
-        # Submit a task that sleeps for 1 s while holding the block, then drop
-        # the Python ref so only the task's argument reference remains.
+        # Submit a task that sleeps while holding the block, then drop the Python
+        # ref so only the task's argument reference remains.
         task_future = _hold_ref_for.remote(ref, 1.0)
         del ref
         gc.collect()
-        time.sleep(0.3)  # task is still running; callback must NOT have fired
+        time.sleep(_EARLY_FIRE_GRACE_S)  # task still running; callback must NOT fire
 
         assert (
-            counter.get_object_store_memory_usage("op_task") == self._SIZE_BYTES
+            counter.get_object_store_memory_usage("op_task") == 1 * MiB
         ), "Callback fired too early: counter decremented while task was still running"
 
         ray.get(task_future)  # task completes; now both refs are gone
-        assert _wait_for_counter(
-            counter, "op_task", 0
-        ), "Counter did not reach 0 after task completed and Python ref was deleted"
+        _wait_for_counter(counter, "op_task", 0)
 
 
 if __name__ == "__main__":
     import sys
 
-    import pytest
-
     sys.exit(pytest.main(["-v", __file__]))
diff --git a/python/ray/data/tests/unit/test_block_ref_counter.py b/python/ray/data/tests/unit/test_block_ref_counter.py
new file mode 100644
index 000000000000..08f3f83e3431
--- /dev/null
+++ b/python/ray/data/tests/unit/test_block_ref_counter.py
@@ -0,0 +1,140 @@
+import threading
+from typing import Callable, Dict
+
+import pytest
+
+import ray
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
+
+
+def _ref(uid: int) -> "ray.ObjectRef":
+    """Real ObjectRef with a deterministic distinct 28-byte ID, no Ray cluster needed."""
+    return ray.ObjectRef(uid.to_bytes(28, "big"))
+
+
+class FakeAddObjectOutOfScopeCallback:
+    """Test double for CoreWorker.add_object_out_of_scope_callback.
+
+    Records each registered callback keyed by the block's object-ID bytes so a
+    test can fire it explicitly. Set registered=False to simulate an object
+    that is already out of scope at registration time.
+    """
+
+    def __init__(self, registered: bool = True):
+        self._registered = registered
+        self._callbacks: Dict[bytes, Callable[[bytes], None]] = {}
+
+    def __call__(
+        self, object_ref: "ray.ObjectRef", callback: Callable[[bytes], None]
+    ) -> bool:
+        if self._registered:
+            self._callbacks[object_ref.binary()] = callback
+        return self._registered
+
+    def fire(self, object_ref: "ray.ObjectRef") -> None:
+        id_binary = object_ref.binary()
+        self._callbacks[id_binary](id_binary)
+
+
+class TestBlockRefCounterAccounting:
+    def test_single_block_produced_and_released(self):
+        add_cb = FakeAddObjectOutOfScopeCallback()
+        counter = BlockRefCounter(add_object_out_of_scope_callback=add_cb)
+        ref = _ref(1)
+
+        counter.on_block_produced(ref, 1, "op_a")
+        assert counter.get_object_store_memory_usage("op_a") == 1
+
+        add_cb.fire(ref)
+        assert counter.get_object_store_memory_usage("op_a") == 0
+
+    def test_multiple_blocks_same_producer(self):
+        add_cb = FakeAddObjectOutOfScopeCallback()
+        counter = BlockRefCounter(add_object_out_of_scope_callback=add_cb)
+        ref1, ref2 = _ref(1), _ref(2)
+
+        counter.on_block_produced(ref1, 1, "op_a")
+        counter.on_block_produced(ref2, 1, "op_a")
+        assert counter.get_object_store_memory_usage("op_a") == 2
+
+        add_cb.fire(ref1)
+        assert counter.get_object_store_memory_usage("op_a") == 1
+        add_cb.fire(ref2)
+        assert counter.get_object_store_memory_usage("op_a") == 0
+
+    def test_multiple_producers_isolated(self):
+        add_cb = FakeAddObjectOutOfScopeCallback()
+        counter = BlockRefCounter(add_object_out_of_scope_callback=add_cb)
+        ref1, ref2 = _ref(1), _ref(2)
+
+        counter.on_block_produced(ref1, 1, "op_a")
+        counter.on_block_produced(ref2, 1, "op_b")
+        assert counter.get_object_store_memory_usage("op_a") == 1
+        assert counter.get_object_store_memory_usage("op_b") == 1
+
+        add_cb.fire(ref1)
+        assert counter.get_object_store_memory_usage("op_a") == 0
+        assert counter.get_object_store_memory_usage("op_b") == 1
+
+    def test_register_when_object_already_out_of_scope(self):
+        """If registration reports the object is already gone, the increment is
+        undone immediately so the producer nets to zero."""
+        add_cb = FakeAddObjectOutOfScopeCallback(registered=False)
+        counter = BlockRefCounter(add_object_out_of_scope_callback=add_cb)
+
+        counter.on_block_produced(_ref(1), 1, "op_a")
+        assert counter.get_object_store_memory_usage("op_a") == 0
+
+
+class TestBlockRefCounterClear:
+    def test_clear_resets_usage(self):
+        add_cb = FakeAddObjectOutOfScopeCallback()
+        counter = BlockRefCounter(add_object_out_of_scope_callback=add_cb)
+        counter.on_block_produced(_ref(1), 1, "op_a")
+        assert counter.get_object_store_memory_usage("op_a") == 1
+
+        counter.clear()
+        assert counter.get_object_store_memory_usage("op_a") == 0
+
+    def test_stale_callback_after_clear_is_noop(self):
+        """A stale callback firing after clear() must not touch accounting
+        recorded after the reset."""
+        add_cb = FakeAddObjectOutOfScopeCallback()
+        counter = BlockRefCounter(add_object_out_of_scope_callback=add_cb)
+        stale_ref = _ref(1)
+        counter.on_block_produced(stale_ref, 1, "op_a")
+
+        counter.clear()
+
+        counter.on_block_produced(_ref(2), 1, "op_a")
+        assert counter.get_object_store_memory_usage("op_a") == 1
+
+        add_cb.fire(stale_ref)
+        assert counter.get_object_store_memory_usage("op_a") == 1
+
+
+class TestBlockRefCounterThreadSafety:
+    def test_concurrent_callbacks_dont_corrupt_state(self):
+        """Many threads firing callbacks at once must not corrupt the count."""
+        add_cb = FakeAddObjectOutOfScopeCallback()
+        counter = BlockRefCounter(add_object_out_of_scope_callback=add_cb)
+        producer_id = "op_concurrent"
+        n = 50
+        refs = [_ref(i) for i in range(n)]
+        for ref in refs:
+            counter.on_block_produced(ref, 1, producer_id)
+        assert counter.get_object_store_memory_usage(producer_id) == n
+
+        threads = [threading.Thread(target=add_cb.fire, args=(ref,)) for ref in refs]
+        for t in threads:
+            t.start()
+        for t in threads:
+            t.join()
+
+        assert counter.get_object_store_memory_usage(producer_id) == 0
+
+
+if __name__ == "__main__":
+    import sys
+
+    sys.exit(pytest.main(["-v", __file__]))

From 6b4ca47cc712244aa07f11f8d3b90581555a75ed Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 22 Jun 2026 14:54:56 -0700
Subject: [PATCH 26/53] Make on_block_produced idempotent

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../_internal/execution/block_ref_counter.py     |  4 +++-
 .../data/tests/unit/test_block_ref_counter.py    | 16 ++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/python/ray/data/_internal/execution/block_ref_counter.py b/python/ray/data/_internal/execution/block_ref_counter.py
index 148b8f4a7bdb..9fd21704f13e 100644
--- a/python/ray/data/_internal/execution/block_ref_counter.py
+++ b/python/ray/data/_internal/execution/block_ref_counter.py
@@ -39,10 +39,12 @@ def on_block_produced(
         to block_ref are gone the bytes are automatically removed from the
         producer's usage.
 
-        Not idempotent: calling twice with the same block_ref double-counts.
+        Idempotent: calling twice with the same block_ref is a no-op.
         """
         id_binary = block_ref.binary()
         with self._lock:
+            if id_binary in self._registered_ids:
+                return
             self._registered_ids.add(id_binary)
             self._bytes_by_producer[producer_id] += size_bytes
 
diff --git a/python/ray/data/tests/unit/test_block_ref_counter.py b/python/ray/data/tests/unit/test_block_ref_counter.py
index 08f3f83e3431..6b2a6b752a19 100644
--- a/python/ray/data/tests/unit/test_block_ref_counter.py
+++ b/python/ray/data/tests/unit/test_block_ref_counter.py
@@ -76,6 +76,22 @@ def test_multiple_producers_isolated(self):
         assert counter.get_object_store_memory_usage("op_a") == 0
         assert counter.get_object_store_memory_usage("op_b") == 1
 
+    def test_duplicate_registration_is_noop(self):
+        """on_block_produced is idempotent: a duplicate ref is silently ignored.
+
+        This matters when an AllToAllOperator forwards an input ref unchanged;
+        the ref was already registered by the upstream producer."""
+        add_cb = FakeAddObjectOutOfScopeCallback()
+        counter = BlockRefCounter(add_object_out_of_scope_callback=add_cb)
+        ref = _ref(1)
+
+        counter.on_block_produced(ref, 1, "op_a")
+        counter.on_block_produced(ref, 1, "op_a")
+        assert counter.get_object_store_memory_usage("op_a") == 1
+
+        add_cb.fire(ref)
+        assert counter.get_object_store_memory_usage("op_a") == 0
+
     def test_register_when_object_already_out_of_scope(self):
         """If registration reports the object is already gone, the increment is
         undone immediately so the producer nets to zero."""

From 05ab0e83a926ad10ed2dfd834b3d391ac5da20bd Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 24 Jun 2026 19:16:26 -0700
Subject: [PATCH 27/53] Address review comments (default callback lookup in __init__, test cleanups)

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../_internal/execution/block_ref_counter.py  | 14 ++++----
 .../ray/data/tests/test_block_ref_counter.py  | 33 ++++++++++---------
 .../data/tests/unit/test_block_ref_counter.py | 12 +++----
 3 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/python/ray/data/_internal/execution/block_ref_counter.py b/python/ray/data/_internal/execution/block_ref_counter.py
index 9fd21704f13e..bd3b6c37c459 100644
--- a/python/ray/data/_internal/execution/block_ref_counter.py
+++ b/python/ray/data/_internal/execution/block_ref_counter.py
@@ -3,6 +3,7 @@
 from typing import Callable, Dict, Optional
 
 import ray
+from ray._private.worker import global_worker
 
 
 class BlockRefCounter:
@@ -19,6 +20,10 @@ def __init__(
             Callable[["ray.ObjectRef", Callable[[bytes], None]], bool]
         ] = None,
     ):
+        if add_object_out_of_scope_callback is None:
+            add_object_out_of_scope_callback = (
+                global_worker.core_worker.add_object_out_of_scope_callback
+            )
         self._add_callback_fn = add_object_out_of_scope_callback
         # IDs of live blocks. Stale callbacks (fired after clear()) check
         # membership here and no-op, preventing negative _bytes_by_producer.
@@ -56,14 +61,7 @@ def _on_object_freed(id_bytes: bytes) -> None:
                 self._registered_ids.discard(id_bytes)
                 self._bytes_by_producer[producer_id] -= size_bytes
 
-        add_callback = self._add_callback_fn
-        if add_callback is None:
-            import ray._private.worker
-
-            core_worker = ray._private.worker.global_worker.core_worker  # type: ignore[attr-defined]
-            add_callback = core_worker.add_object_out_of_scope_callback
-
-        registered = add_callback(block_ref, _on_object_freed)
+        registered = self._add_callback_fn(block_ref, _on_object_freed)
         if not registered:
             _on_object_freed(id_binary)
 
diff --git a/python/ray/data/tests/test_block_ref_counter.py b/python/ray/data/tests/test_block_ref_counter.py
index e22007f75879..2f27ab6d3974 100644
--- a/python/ray/data/tests/test_block_ref_counter.py
+++ b/python/ray/data/tests/test_block_ref_counter.py
@@ -30,7 +30,8 @@ def _hold_ref_for(block_ref, sleep_s: float) -> bool:
 
 @pytest.fixture(params=["inlined", "regular"])
 def make_block(request):
-    """Factory for a block ObjectRef, parametrized over the two storage paths.
+    """Factory for a block (ObjectRef, size_bytes), parametrized over the two
+    storage paths.
 
     Ray Core inlines tiny objects in the in-process store and puts larger ones
     in the shared-memory object store; the out-of-scope callback must work for
@@ -38,15 +39,17 @@ def make_block(request):
     an extra reference that would keep the object alive past the test's own ``del``.
     """
 
-    def _make() -> "ray.ObjectRef":
+    def _make() -> tuple["ray.ObjectRef", int]:
         if request.param == "inlined":
-            return ray.put(0)
-        return ray.put(np.zeros(1 * MiB, dtype=np.uint8))
+            data = np.zeros(1, dtype=np.uint8)
+        else:
+            data = np.zeros(1 * MiB, dtype=np.uint8)
+        return ray.put(data), len(data)
 
     return _make
 
 
-def _wait_for_counter(counter, producer_id, expected, timeout_s: float = 10.0):
+def _wait_for_counter(*, counter, producer_id, expected, timeout_s: float = 10.0):
     """Wait until *counter* reports *expected* bytes for *producer_id*.
 
     ``gc.collect()`` runs on each poll so any pending Python-level ObjectRef
@@ -67,13 +70,13 @@ def test_callback_fires_after_last_python_ref_deleted(
     ):
         """Counter reaches 0 once the only Python ObjectRef is GC'd."""
         counter = BlockRefCounter()
-        ref = make_block()
+        ref, size_bytes = make_block()
 
-        counter.on_block_produced(ref, 1 * MiB, "op_basic")
-        assert counter.get_object_store_memory_usage("op_basic") == 1 * MiB
+        counter.on_block_produced(ref, size_bytes, "op_basic")
+        assert counter.get_object_store_memory_usage("op_basic") == size_bytes
 
         del ref  # last Python ref gone
-        _wait_for_counter(counter, "op_basic", 0)
+        _wait_for_counter(counter=counter, producer_id="op_basic", expected=0)
 
     def test_second_python_ref_keeps_counter_alive(
         self, ray_start_regular_shared, make_block
@@ -84,22 +87,22 @@ def test_second_python_ref_keeps_counter_alive(
         the callback. Only the final ref drop may do so.
         """
         counter = BlockRefCounter()
-        ref1 = make_block()
+        ref1, size_bytes = make_block()
         ref2 = ref1  # second Python ref to the same ObjectID
 
-        counter.on_block_produced(ref1, 1 * MiB, "op_two_refs")
-        assert counter.get_object_store_memory_usage("op_two_refs") == 1 * MiB
+        counter.on_block_produced(ref1, size_bytes, "op_two_refs")
+        assert counter.get_object_store_memory_usage("op_two_refs") == size_bytes
 
         del ref1
         gc.collect()
         time.sleep(_EARLY_FIRE_GRACE_S)  # counter must still be non-zero
 
         assert (
-            counter.get_object_store_memory_usage("op_two_refs") == 1 * MiB
+            counter.get_object_store_memory_usage("op_two_refs") == size_bytes
         ), "Callback fired too early — counter decremented while ref2 was still alive"
 
         del ref2  # last ref gone; callback must now fire
-        _wait_for_counter(counter, "op_two_refs", 0)
+        _wait_for_counter(counter=counter, producer_id="op_two_refs", expected=0)
 
     def test_task_ref_keeps_counter_alive_until_task_completes(
         self, ray_start_regular_shared
@@ -132,7 +135,7 @@ def test_task_ref_keeps_counter_alive_until_task_completes(
         ), "Callback fired too early: counter decremented while task was still running"
 
         ray.get(task_future)  # task completes; now both refs are gone
-        _wait_for_counter(counter, "op_task", 0)
+        _wait_for_counter(counter=counter, producer_id="op_task", expected=0)
 
 
 if __name__ == "__main__":
diff --git a/python/ray/data/tests/unit/test_block_ref_counter.py b/python/ray/data/tests/unit/test_block_ref_counter.py
index 6b2a6b752a19..f1b01d575269 100644
--- a/python/ray/data/tests/unit/test_block_ref_counter.py
+++ b/python/ray/data/tests/unit/test_block_ref_counter.py
@@ -16,20 +16,20 @@ class FakeAddObjectOutOfScopeCallback:
     """Test double for CoreWorker.add_object_out_of_scope_callback.
 
     Records each registered callback keyed by the block's object-ID bytes so a
-    test can fire it explicitly. Set registered=False to simulate an object
+    test can fire it explicitly. Set should_registration_fail=True to simulate an object
     that is already out of scope at registration time.
     """
 
-    def __init__(self, registered: bool = True):
-        self._registered = registered
+    def __init__(self, should_registration_fail: bool = False):
+        self._should_fail = should_registration_fail
         self._callbacks: Dict[bytes, Callable[[bytes], None]] = {}
 
     def __call__(
         self, object_ref: "ray.ObjectRef", callback: Callable[[bytes], None]
     ) -> bool:
-        if self._registered:
+        if not self._should_fail:
             self._callbacks[object_ref.binary()] = callback
-        return self._registered
+        return not self._should_fail
 
     def fire(self, object_ref: "ray.ObjectRef") -> None:
         id_binary = object_ref.binary()
@@ -95,7 +95,7 @@ def test_duplicate_registration_is_noop(self):
     def test_register_when_object_already_out_of_scope(self):
         """If registration reports the object is already gone, the increment is
         undone immediately so the producer nets to zero."""
-        add_cb = FakeAddObjectOutOfScopeCallback(registered=False)
+        add_cb = FakeAddObjectOutOfScopeCallback(should_registration_fail=True)
         counter = BlockRefCounter(add_object_out_of_scope_callback=add_cb)
 
         counter.on_block_produced(_ref(1), 1, "op_a")

From b56551dd874c155e1eb0be8598191ff44db06440 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 24 Jun 2026 20:08:33 -0700
Subject: [PATCH 28/53] Pyrefly fixes + move benchmark

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../_internal/execution/block_ref_counter.py  |   2 +-
 .../ray/data/tests/test_block_ref_counter.py  |   4 +-
 .../dataset/callback_latency_benchmark.py     | 101 ++++++++++++++++++
 release/release_data_tests.yaml               |  10 ++
 4 files changed, 115 insertions(+), 2 deletions(-)
 create mode 100644 release/nightly_tests/dataset/callback_latency_benchmark.py

diff --git a/python/ray/data/_internal/execution/block_ref_counter.py b/python/ray/data/_internal/execution/block_ref_counter.py
index bd3b6c37c459..20b05c9d26b6 100644
--- a/python/ray/data/_internal/execution/block_ref_counter.py
+++ b/python/ray/data/_internal/execution/block_ref_counter.py
@@ -22,7 +22,7 @@ def __init__(
     ):
         if add_object_out_of_scope_callback is None:
             add_object_out_of_scope_callback = (
-                global_worker.core_worker.add_object_out_of_scope_callback
+                global_worker.core_worker.add_object_out_of_scope_callback  # pyrefly: ignore[missing-attribute]
             )
         self._add_callback_fn = add_object_out_of_scope_callback
         # IDs of live blocks. Stale callbacks (fired after clear()) check
diff --git a/python/ray/data/tests/test_block_ref_counter.py b/python/ray/data/tests/test_block_ref_counter.py
index 2f27ab6d3974..b4d14eee499d 100644
--- a/python/ray/data/tests/test_block_ref_counter.py
+++ b/python/ray/data/tests/test_block_ref_counter.py
@@ -118,7 +118,9 @@ def test_task_ref_keeps_counter_alive_until_task_completes(
         reference and this lifetime-extension behavior would not apply.
         """
         counter = BlockRefCounter()
-        ref = ray.put(np.zeros(1 * MiB, dtype=np.uint8))
+        ref = ray.put(
+            np.zeros(1 * MiB, dtype=np.uint8)
+        )  # pyrefly: ignore[bad-argument-type]
 
         counter.on_block_produced(ref, 1 * MiB, "op_task")
         assert counter.get_object_store_memory_usage("op_task") == 1 * MiB
diff --git a/release/nightly_tests/dataset/callback_latency_benchmark.py b/release/nightly_tests/dataset/callback_latency_benchmark.py
new file mode 100644
index 000000000000..109bf58586c9
--- /dev/null
+++ b/release/nightly_tests/dataset/callback_latency_benchmark.py
@@ -0,0 +1,101 @@
+import argparse
+import gc
+import time
+
+import ray
+
+from benchmark import Benchmark
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
+
+_SETTLE_TIMEOUT_S = 30.0
+
+
+@ray.remote(num_cpus=1)
+def _hold_block(block_ref, sleep_s: float) -> None:
+    """Hold block_ref as a task argument for sleep_s seconds, then return."""
+    time.sleep(sleep_s)
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="BlockRefCounter convergence latency benchmark"
+    )
+    parser.add_argument(
+        "--num-tasks",
+        type=int,
+        default=100,
+        help="Number of concurrent tasks / blocks in the burst. "
+        "Latency scales with this value (ReferenceCounter mutex serializes callbacks); "
+        "increase together with cluster size for stress testing.",
+    )
+    parser.add_argument(
+        "--task-sleep-s",
+        type=float,
+        default=0.1,
+        help="How long each task holds its block before completing.",
+    )
+    return parser.parse_args()
+
+
+def run_burst_object_free(args: argparse.Namespace) -> dict:
+    n = args.num_tasks
+    producer_id = "burst_producer"
+    counter = BlockRefCounter()
+
+    # Object size is irrelevant: we measure callback timing, not memory accuracy.
+    refs = [ray.put(b"block") for _ in range(n)]
+    for ref in refs:
+        counter.on_block_produced(ref, 1, producer_id)
+
+    # Submit tasks that each hold one block, then drop Python refs.
+    # SPREAD distributes tasks across nodes to simulate a multi-node burst.
+    task_futures = [
+        _hold_block.options(scheduling_strategy="SPREAD").remote(ref, args.task_sleep_s)
+        for ref in refs
+    ]
+    # Drop Python refs so each block is kept alive solely by its task argument.
+    # The callback can only fire once the task completes AND this ref is gone.
+    del refs
+    gc.collect()
+
+    # T0 is stamped after all tasks finish so settle_time_s measures only the
+    # callback firing lag, not task execution time.
+    ray.get(task_futures)
+    t0 = time.perf_counter()
+    gc.collect()
+
+    negative_events = 0
+    converged = False
+    while time.perf_counter() - t0 < _SETTLE_TIMEOUT_S:
+        gc.collect()
+        usage = counter.get_object_store_memory_usage(producer_id)
+        if usage < 0:
+            negative_events += 1
+        if usage <= 0:
+            converged = True
+            break
+        time.sleep(0.05)
+
+    settle_time_s = time.perf_counter() - t0
+
+    assert negative_events == 0, f"Counter went negative {negative_events} times"
+    assert converged, (
+        f"Counter did not reach 0 within {_SETTLE_TIMEOUT_S}s; "
+        f"remaining: {counter.get_object_store_memory_usage(producer_id)} bytes"
+    )
+
+    return {
+        "callback_settle_time_s": round(settle_time_s, 4),
+        "num_tasks": n,
+        "counter_negative_events": negative_events,
+    }
+
+
+def main(args: argparse.Namespace):
+    benchmark = Benchmark()
+    benchmark.run_fn("burst-object-free", run_burst_object_free, args)
+    benchmark.write_result()
+
+
+if __name__ == "__main__":
+    main(parse_args())
diff --git a/release/release_data_tests.yaml b/release/release_data_tests.yaml
index 88d84781f895..19cbbb6dbcbf 100644
--- a/release/release_data_tests.yaml
+++ b/release/release_data_tests.yaml
@@ -750,6 +750,16 @@
       run:
         script: python backpressure_benchmark.py --case training-prefetch --num-trainers 1
 
+- name: block_ref_counter_convergence
+  python: "3.10"
+  cluster:
+    anyscale_sdk_2026: true
+    cluster_compute: fixed_size_100_cpu_compute.yaml
+
+  run:
+    timeout: 300
+    script: python callback_latency_benchmark.py
+
 
 ########################
 # Sort and shuffle tests

From e833b1ad871d4e71cabfe5300f24833e7be4bbba Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 24 Jun 2026 20:50:59 -0700
Subject: [PATCH 29/53] Move pyrefly ignore comment onto the np.zeros argument line

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/data/tests/test_block_ref_counter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/ray/data/tests/test_block_ref_counter.py b/python/ray/data/tests/test_block_ref_counter.py
index b4d14eee499d..6c17439db50e 100644
--- a/python/ray/data/tests/test_block_ref_counter.py
+++ b/python/ray/data/tests/test_block_ref_counter.py
@@ -119,8 +119,8 @@ def test_task_ref_keeps_counter_alive_until_task_completes(
         """
         counter = BlockRefCounter()
         ref = ray.put(
-            np.zeros(1 * MiB, dtype=np.uint8)
-        )  # pyrefly: ignore[bad-argument-type]
+            np.zeros(1 * MiB, dtype=np.uint8)  # pyrefly: ignore[bad-argument-type]
+        )
 
         counter.on_block_produced(ref, 1 * MiB, "op_task")
         assert counter.get_object_store_memory_usage("op_task") == 1 * MiB

From a1f3a26a57ac6a49370159c09a921470ce7f058e Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Thu, 25 Jun 2026 11:29:57 -0700
Subject: [PATCH 30/53] Address edge case of unowned blocks in SplitCoordinator

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../_internal/execution/block_ref_counter.py  |  7 ++++-
 .../ray/data/tests/test_block_ref_counter.py  | 26 +++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/python/ray/data/_internal/execution/block_ref_counter.py b/python/ray/data/_internal/execution/block_ref_counter.py
index 20b05c9d26b6..7c73022e4877 100644
--- a/python/ray/data/_internal/execution/block_ref_counter.py
+++ b/python/ray/data/_internal/execution/block_ref_counter.py
@@ -61,7 +61,12 @@ def _on_object_freed(id_bytes: bytes) -> None:
                 self._registered_ids.discard(id_bytes)
                 self._bytes_by_producer[producer_id] -= size_bytes
 
-        registered = self._add_callback_fn(block_ref, _on_object_freed)
+        try:
+            registered = self._add_callback_fn(block_ref, _on_object_freed)
+        except ValueError:
+            # Block not owned by this worker; can't track it.
+            _on_object_freed(id_binary)
+            return
         if not registered:
             _on_object_freed(id_binary)
 
diff --git a/python/ray/data/tests/test_block_ref_counter.py b/python/ray/data/tests/test_block_ref_counter.py
index 6c17439db50e..e775aabb3430 100644
--- a/python/ray/data/tests/test_block_ref_counter.py
+++ b/python/ray/data/tests/test_block_ref_counter.py
@@ -140,6 +140,32 @@ def test_task_ref_keeps_counter_alive_until_task_completes(
         _wait_for_counter(counter=counter, producer_id="op_task", expected=0)
 
 
+class TestBlockRefCounterNonOwnedBlocks:
+    def test_non_owned_block_not_tracked(self, ray_start_regular_shared):
+        """on_block_produced gracefully skips blocks not owned by this worker.
+
+        This reproduces the SplitCoordinator scenario where an actor-hosted
+        executor receives driver-owned blocks and calls on_block_produced on
+        them. The callback registration raises ValueError because the actor
+        does not own the blocks. The counter must handle this without crashing
+        and must not count the block's bytes.
+        """
+
+        @ray.remote
+        class CounterActor:
+            def try_track_non_owned_block(self, block_ref, size_bytes):
+                counter = BlockRefCounter()
+                counter.on_block_produced(block_ref, size_bytes, "op_split")
+                return counter.get_object_store_memory_usage("op_split")
+
+        block_ref = ray.put(np.zeros(1024, dtype=np.uint8))
+        actor = CounterActor.remote()
+        usage = ray.get(actor.try_track_non_owned_block.remote(block_ref, 1024))
+        assert (
+            usage == 0
+        ), f"Non-owned block should not be tracked, but got {usage} bytes"
+
+
 if __name__ == "__main__":
     import sys
 

From b2181a5f924ffbac697ca095e40e68000b9ba154 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Fri, 26 Jun 2026 14:30:42 -0700
Subject: [PATCH 31/53] Wrap block_ref in a list when passing it to CounterActor

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/data/tests/test_block_ref_counter.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/python/ray/data/tests/test_block_ref_counter.py b/python/ray/data/tests/test_block_ref_counter.py
index e775aabb3430..dcaea06fe507 100644
--- a/python/ray/data/tests/test_block_ref_counter.py
+++ b/python/ray/data/tests/test_block_ref_counter.py
@@ -153,14 +153,19 @@ def test_non_owned_block_not_tracked(self, ray_start_regular_shared):
 
         @ray.remote
         class CounterActor:
-            def try_track_non_owned_block(self, block_ref, size_bytes):
+            def try_track_non_owned_block(self, block_ref_wrapped, size_bytes):
+                # Unwrap: ObjectRef was wrapped in a list to prevent
+                # Ray from auto-resolving it into the value.
+                block_ref = block_ref_wrapped[0]
                 counter = BlockRefCounter()
                 counter.on_block_produced(block_ref, size_bytes, "op_split")
                 return counter.get_object_store_memory_usage("op_split")
 
-        block_ref = ray.put(np.zeros(1024, dtype=np.uint8))
-        actor = CounterActor.remote()
-        usage = ray.get(actor.try_track_non_owned_block.remote(block_ref, 1024))
+        block_ref = ray.put(
+            np.zeros(1, dtype=np.uint8)  # pyrefly: ignore[bad-argument-type]
+        )
+        actor = CounterActor.remote()  # pyrefly: ignore[missing-attribute]
+        usage = ray.get(actor.try_track_non_owned_block.remote([block_ref], 1))
         assert (
             usage == 0
         ), f"Non-owned block should not be tracked, but got {usage} bytes"

From 39775f74cb4cc595f9e9fac47fab226677eb0c92 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Fri, 26 Jun 2026 14:32:40 -0700
Subject: [PATCH 32/53] Improve comments in test_non_owned_block_not_tracked

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/data/tests/test_block_ref_counter.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/python/ray/data/tests/test_block_ref_counter.py b/python/ray/data/tests/test_block_ref_counter.py
index dcaea06fe507..9e70f1cf7ba7 100644
--- a/python/ray/data/tests/test_block_ref_counter.py
+++ b/python/ray/data/tests/test_block_ref_counter.py
@@ -144,18 +144,19 @@ class TestBlockRefCounterNonOwnedBlocks:
     def test_non_owned_block_not_tracked(self, ray_start_regular_shared):
         """on_block_produced gracefully skips blocks not owned by this worker.
 
-        This reproduces the SplitCoordinator scenario where an actor-hosted
-        executor receives driver-owned blocks and calls on_block_produced on
-        them. The callback registration raises ValueError because the actor
-        does not own the blocks. The counter must handle this without crashing
-        and must not count the block's bytes.
+        When SplitCoordinator runs a streaming executor inside an actor, the
+        input blocks are owned by the driver. The actor cannot register
+        out-of-scope callbacks on them, so on_block_produced should catch the
+        ValueError and leave the usage at zero.
+
+        We wrap the ObjectRef in a list to prevent Ray from auto-resolving it,
+        mirroring how SplitCoordinator receives refs embedded in a serialized
+        Dataset object.
         """
 
         @ray.remote
         class CounterActor:
             def try_track_non_owned_block(self, block_ref_wrapped, size_bytes):
-                # Unwrap: ObjectRef was wrapped in a list to prevent
-                # Ray from auto-resolving it into the value.
                 block_ref = block_ref_wrapped[0]
                 counter = BlockRefCounter()
                 counter.on_block_produced(block_ref, size_bytes, "op_split")

From b20198b64decd239020fa14f97f1586cda43aa39 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 29 Jun 2026 13:52:44 -0700
Subject: [PATCH 33/53] Remove data-side latency benchmark: the core benchmark
 already covers it, and other data tests already cover the object-free burst
 issue

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../dataset/callback_latency_benchmark.py     | 101 ------------------
 1 file changed, 101 deletions(-)
 delete mode 100644 release/nightly_tests/dataset/callback_latency_benchmark.py

diff --git a/release/nightly_tests/dataset/callback_latency_benchmark.py b/release/nightly_tests/dataset/callback_latency_benchmark.py
deleted file mode 100644
index 109bf58586c9..000000000000
--- a/release/nightly_tests/dataset/callback_latency_benchmark.py
+++ /dev/null
@@ -1,101 +0,0 @@
-import argparse
-import gc
-import time
-
-import ray
-
-from benchmark import Benchmark
-from ray.data._internal.execution.block_ref_counter import BlockRefCounter
-
-_SETTLE_TIMEOUT_S = 30.0
-
-
-@ray.remote(num_cpus=1)
-def _hold_block(block_ref, sleep_s: float) -> None:
-    """Hold block_ref as a task argument for sleep_s seconds, then return."""
-    time.sleep(sleep_s)
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="BlockRefCounter convergence latency benchmark"
-    )
-    parser.add_argument(
-        "--num-tasks",
-        type=int,
-        default=100,
-        help="Number of concurrent tasks / blocks in the burst. "
-        "Latency scales with this value (ReferenceCounter mutex serializes callbacks); "
-        "increase together with cluster size for stress testing.",
-    )
-    parser.add_argument(
-        "--task-sleep-s",
-        type=float,
-        default=0.1,
-        help="How long each task holds its block before completing.",
-    )
-    return parser.parse_args()
-
-
-def run_burst_object_free(args: argparse.Namespace) -> dict:
-    n = args.num_tasks
-    producer_id = "burst_producer"
-    counter = BlockRefCounter()
-
-    # Object size is irrelevant: we measure callback timing, not memory accuracy.
-    refs = [ray.put(b"block") for _ in range(n)]
-    for ref in refs:
-        counter.on_block_produced(ref, 1, producer_id)
-
-    # Submit tasks that each hold one block, then drop Python refs.
-    # SPREAD distributes tasks across nodes to simulate a multi-node burst.
-    task_futures = [
-        _hold_block.options(scheduling_strategy="SPREAD").remote(ref, args.task_sleep_s)
-        for ref in refs
-    ]
-    # Drop Python refs so each block is kept alive solely by its task argument.
-    # The callback can only fire once the task completes AND this ref is gone.
-    del refs
-    gc.collect()
-
-    # T0 is stamped after all tasks finish so settle_time_s measures only the
-    # callback firing lag, not task execution time.
-    ray.get(task_futures)
-    t0 = time.perf_counter()
-    gc.collect()
-
-    negative_events = 0
-    converged = False
-    while time.perf_counter() - t0 < _SETTLE_TIMEOUT_S:
-        gc.collect()
-        usage = counter.get_object_store_memory_usage(producer_id)
-        if usage < 0:
-            negative_events += 1
-        if usage <= 0:
-            converged = True
-            break
-        time.sleep(0.05)
-
-    settle_time_s = time.perf_counter() - t0
-
-    assert negative_events == 0, f"Counter went negative {negative_events} times"
-    assert converged, (
-        f"Counter did not reach 0 within {_SETTLE_TIMEOUT_S}s; "
-        f"remaining: {counter.get_object_store_memory_usage(producer_id)} bytes"
-    )
-
-    return {
-        "callback_settle_time_s": round(settle_time_s, 4),
-        "num_tasks": n,
-        "counter_negative_events": negative_events,
-    }
-
-
-def main(args: argparse.Namespace):
-    benchmark = Benchmark()
-    benchmark.run_fn("burst-object-free", run_burst_object_free, args)
-    benchmark.write_result()
-
-
-if __name__ == "__main__":
-    main(parse_args())

From c6f3c05f22665664a959bd617918d7875ef69ef4 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 29 Jun 2026 15:11:37 -0700
Subject: [PATCH 34/53] Unregister burst benchmark

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 release/release_data_tests.yaml | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/release/release_data_tests.yaml b/release/release_data_tests.yaml
index 19cbbb6dbcbf..88d84781f895 100644
--- a/release/release_data_tests.yaml
+++ b/release/release_data_tests.yaml
@@ -750,16 +750,6 @@
       run:
         script: python backpressure_benchmark.py --case training-prefetch --num-trainers 1
 
-- name: block_ref_counter_convergence
-  python: "3.10"
-  cluster:
-    anyscale_sdk_2026: true
-    cluster_compute: fixed_size_100_cpu_compute.yaml
-
-  run:
-    timeout: 300
-    script: python callback_latency_benchmark.py
-
 
 ########################
 # Sort and shuffle tests

From 97eb0163c76d867bd37db71cdb8868f7e0492ae5 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 17 Jun 2026 15:17:48 -0700
Subject: [PATCH 35/53] Wire BlockRefCounter through operators

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../execution/interfaces/physical_operator.py | 23 ++++++++++++++-
 .../operators/actor_pool_map_operator.py      |  4 +--
 .../operators/base_physical_operator.py       | 16 ++++++++++-
 .../execution/operators/hash_shuffle.py       |  6 ++--
 .../execution/operators/input_data_buffer.py  |  4 +--
 .../execution/operators/map_operator.py       | 12 +++++---
 .../execution/operators/output_splitter.py    |  4 +--
 .../execution/operators/union_operator.py     |  4 +--
 .../_internal/execution/resource_manager.py   |  8 ++++++
 .../_internal/execution/streaming_executor.py |  7 +++++
 .../execution/streaming_executor_state.py     |  1 -
 python/ray/data/tests/test_operators.py       |  2 ++
 .../ray/data/tests/test_streaming_executor.py | 28 +++++++++++++------
 13 files changed, 94 insertions(+), 25 deletions(-)

diff --git a/python/ray/data/_internal/execution/interfaces/physical_operator.py b/python/ray/data/_internal/execution/interfaces/physical_operator.py
index 545458506a4f..fc492400629b 100644
--- a/python/ray/data/_internal/execution/interfaces/physical_operator.py
+++ b/python/ray/data/_internal/execution/interfaces/physical_operator.py
@@ -24,6 +24,7 @@
     ActorPoolInfo,
     AutoscalingActorPool,
 )
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.interfaces.execution_options import (
     ExecutionOptions,
     ExecutionResources,
@@ -134,6 +135,8 @@ def __init__(
         self,
         task_index: int,
         streaming_gen: ObjectRefGenerator,
+        block_ref_counter: BlockRefCounter,
+        producer_id: str,
         output_ready_callback: Callable[[RefBundle], None] = lambda bundle: None,
         task_done_callback: TaskDoneCallbackType = lambda exc, worker_stats, driver_stats: None,
         block_ready_callback: Callable[
@@ -149,6 +152,9 @@ def __init__(
         Args:
             task_index: Index of the task. Used for callbacks.
             streaming_gen: The streaming generator of this task. It should yield blocks.
+            block_ref_counter: The centralized block reference counter. on_block_produced
+                is called for each block yielded by this task.
+            producer_id: The id of the operator that produces the blocks from this task.
             output_ready_callback: The callback to call when a new RefBundle is output
                 from the generator.
             task_done_callback: The callback to call when the task is done.
@@ -171,6 +177,8 @@ def __init__(
         self._block_ready_callback = block_ready_callback
         self._metadata_ready_callback = metadata_ready_callback
         self._operator_name = operator_name
+        self._block_ref_counter: BlockRefCounter = block_ref_counter
+        self._producer_id: str = producer_id
 
         # If the generator hasn't produced block metadata yet, or if the block metadata
         # object isn't available after we get a reference, we need store the pending
@@ -292,6 +300,9 @@ def on_data_ready(self, max_bytes_to_read: Optional[int]) -> int:
                 meta_with_schema_bytes
             )
             meta = meta_with_schema.metadata
+            self._block_ref_counter.on_block_produced(
+                self._pending_block_ref, meta.size_bytes or 0, self._producer_id
+            )
             self._output_ready_callback(
                 RefBundle(
                     [BlockEntry(self._pending_block_ref, meta)],
@@ -444,6 +455,7 @@ def __init__(
         self._id = str(uuid.uuid4())
         # Initialize metrics after data_context is set
         self._metrics = OpRuntimeMetrics(self)
+        self._block_ref_counter: Optional[BlockRefCounter] = None
 
     def __reduce__(self):
         raise ValueError("Operator is not serializable.")
@@ -743,12 +755,21 @@ def num_output_splits(self) -> int:
         """
         return self._num_output_splits
 
-    def start(self, options: ExecutionOptions) -> None:
+    def start(
+        self,
+        options: ExecutionOptions,
+        block_ref_counter: Optional[BlockRefCounter] = None,
+    ) -> None:
         """Called by the executor when execution starts for an operator.
 
         Args:
             options: The global options used for the overall execution.
+            block_ref_counter: The executor-wide shared counter for tracking
+                object-store memory. If omitted, a fresh per-operator counter is used.
         """
+        self._block_ref_counter = (
+            block_ref_counter if block_ref_counter is not None else BlockRefCounter()
+        )
         self._started = True
 
     def can_add_input(self) -> bool:
diff --git a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py
index 5fe2f29f443f..de38d336c551 100644
--- a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py
+++ b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py
@@ -265,9 +265,9 @@ def _apply_default_actor_task_remote_args(
 
         return ray_actor_task_remote_args
 
-    def start(self, options: ExecutionOptions):
+    def start(self, options: ExecutionOptions, block_ref_counter=None):
         self._actor_locality_enabled = options.actor_locality_enabled
-        super().start(options)
+        super().start(options, block_ref_counter)
 
         self._actor_cls = ray.remote(**self._ray_remote_args)(self._map_worker_cls)
         self._actor_pool.scale(
diff --git a/python/ray/data/_internal/execution/operators/base_physical_operator.py b/python/ray/data/_internal/execution/operators/base_physical_operator.py
index 388e2790608a..e48562a07e5c 100644
--- a/python/ray/data/_internal/execution/operators/base_physical_operator.py
+++ b/python/ray/data/_internal/execution/operators/base_physical_operator.py
@@ -183,9 +183,23 @@ def all_inputs_done(self) -> None:
         )
         # NOTE: We don't account object store memory use from intermediate `bulk_fn`
         # outputs (e.g., map outputs for map-reduce).
-        output_buffer, self._stats = self._bulk_fn(self._input_buffer.to_list(), ctx)
+
+        # Snapshot input refs before calling bulk_fn. Some bulk_fns (e.g.
+        # randomize_blocks) forward input ObjectRefs unchanged to the output.
+        # We only call on_block_produced for genuinely new refs to avoid
+        # double-counting; forwarded refs stay attributed to their original producer.
+        input_bundles = self._input_buffer.to_list()
+        input_refs = {entry.ref for bundle in input_bundles for entry in bundle.blocks}
+        output_buffer, self._stats = self._bulk_fn(input_bundles, ctx)
         self._output_buffer = FIFOBundleQueue(output_buffer)
 
+        for bundle in output_buffer:
+            for entry in bundle.blocks:
+                if entry.ref not in input_refs:
+                    self._block_ref_counter.on_block_produced(
+                        entry.ref, entry.metadata.size_bytes or 0, self.id
+                    )
+
         while self._input_buffer.has_next():
             refs = self._input_buffer.get_next()
             self._metrics.on_input_dequeued(refs, input_index=0)
diff --git a/python/ray/data/_internal/execution/operators/hash_shuffle.py b/python/ray/data/_internal/execution/operators/hash_shuffle.py
index 36c1303e449a..a895edebdcc7 100644
--- a/python/ray/data/_internal/execution/operators/hash_shuffle.py
+++ b/python/ray/data/_internal/execution/operators/hash_shuffle.py
@@ -673,8 +673,8 @@ def __init__(
         self._reduce_bar = None
         self._reduce_metrics = OpRuntimeMetrics(self)
 
-    def start(self, options: ExecutionOptions) -> None:
-        super().start(options)
+    def start(self, options: ExecutionOptions, block_ref_counter=None) -> None:
+        super().start(options, block_ref_counter)
 
     @property
     def shuffle_name(self) -> str:
@@ -1196,6 +1196,8 @@ def _on_aggregation_done(
                     ExecutionResources.from_resource_dict(finalize_task_resource_bundle)
                 ),
                 operator_name=self.name,
+                block_ref_counter=self._block_ref_counter,
+                producer_id=self.id,
             )
             self._finalizing_tasks[partition_id] = data_task
 
diff --git a/python/ray/data/_internal/execution/operators/input_data_buffer.py b/python/ray/data/_internal/execution/operators/input_data_buffer.py
index 66cdde25c81a..5b89ea02c7d3 100644
--- a/python/ray/data/_internal/execution/operators/input_data_buffer.py
+++ b/python/ray/data/_internal/execution/operators/input_data_buffer.py
@@ -45,7 +45,7 @@ def __init__(
         self._input_data_index = 0
         self.mark_execution_finished()
 
-    def start(self, options: ExecutionOptions) -> None:
+    def start(self, options: ExecutionOptions, block_ref_counter=None) -> None:
         if not self._is_input_initialized:
             self._input_data = self._input_data_factory(
                 self.target_max_block_size_override
@@ -57,7 +57,7 @@ def start(self, options: ExecutionOptions) -> None:
         # so we record input metrics here
         for bundle in self._input_data:
             self._metrics.on_input_received(bundle)
-        super().start(options)
+        super().start(options, block_ref_counter)
 
     def has_next(self) -> bool:
         return self._input_data_index < len(self._input_data)
diff --git a/python/ray/data/_internal/execution/operators/map_operator.py b/python/ray/data/_internal/execution/operators/map_operator.py
index a4cb7aca5ed3..7e4a205def6f 100644
--- a/python/ray/data/_internal/execution/operators/map_operator.py
+++ b/python/ray/data/_internal/execution/operators/map_operator.py
@@ -485,8 +485,8 @@ def create(
         else:
             raise ValueError(f"Unsupported execution strategy {compute_strategy}")
 
-    def start(self, options: "ExecutionOptions"):
-        super().start(options)
+    def start(self, options: "ExecutionOptions", block_ref_counter=None):
+        super().start(options, block_ref_counter)
         # Create output queue with desired ordering semantics.
         if options.preserve_order:
             self._output_queue = ReorderingBundleQueue()
@@ -659,8 +659,12 @@ def _task_done_callback(
         data_task = DataOpTask(
             task_index,
             gen,
-            lambda output: _output_ready_callback(task_index, output),
-            functools.partial(_task_done_callback, task_index),
+            self._block_ref_counter,
+            self.id,
+            output_ready_callback=lambda output: _output_ready_callback(
+                task_index, output
+            ),
+            task_done_callback=functools.partial(_task_done_callback, task_index),
             operator_name=self.name,
         )
         self._metrics.on_task_submitted(
diff --git a/python/ray/data/_internal/execution/operators/output_splitter.py b/python/ray/data/_internal/execution/operators/output_splitter.py
index f436179a77be..b3fdc566970e 100644
--- a/python/ray/data/_internal/execution/operators/output_splitter.py
+++ b/python/ray/data/_internal/execution/operators/output_splitter.py
@@ -124,13 +124,13 @@ def num_output_rows_total(self) -> Optional[int]:
         # The total number of rows is the same as the number of input rows.
         return self.input_dependencies[0].num_output_rows_total()
 
-    def start(self, options: ExecutionOptions) -> None:
+    def start(self, options: ExecutionOptions, block_ref_counter=None) -> None:
         if options.preserve_order:
             # If preserve_order is set, we need to ignore locality hints to ensure determinism.
             self._locality_hints = None
             self._max_buffer_size = 0
 
-        super().start(options)
+        super().start(options, block_ref_counter)
 
     def throttling_disabled(self) -> bool:
         """Disables resource-based throttling.
diff --git a/python/ray/data/_internal/execution/operators/union_operator.py b/python/ray/data/_internal/execution/operators/union_operator.py
index caf84f3d4a05..62a0f8be1ff4 100644
--- a/python/ray/data/_internal/execution/operators/union_operator.py
+++ b/python/ray/data/_internal/execution/operators/union_operator.py
@@ -59,12 +59,12 @@ def _input_queues(self) -> List["BaseBundleQueue"]:
     def _output_queues(self) -> List["BaseBundleQueue"]:
         return [self._output_buffer]
 
-    def start(self, options: ExecutionOptions):
+    def start(self, options: ExecutionOptions, block_ref_counter=None):
         # Whether to preserve deterministic ordering of output blocks.
         # When True, blocks are emitted in round-robin order across inputs,
         # ensuring the same input always produces the same output order.
         self._preserve_order = options.preserve_order
-        super().start(options)
+        super().start(options, block_ref_counter)
 
     def num_outputs_total(self) -> Optional[int]:
         num_outputs = 0
diff --git a/python/ray/data/_internal/execution/resource_manager.py b/python/ray/data/_internal/execution/resource_manager.py
index dec1597f8a54..50ff9f5a5c08 100644
--- a/python/ray/data/_internal/execution/resource_manager.py
+++ b/python/ray/data/_internal/execution/resource_manager.py
@@ -7,6 +7,7 @@
 
 from ray._common.utils import env_bool, env_float
 from ray.data._internal.execution import create_resource_allocator
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.interfaces.execution_options import (
     ExecutionOptions,
     ExecutionResources,
@@ -140,6 +141,8 @@ def __init__(
         # operator's output usage.
         self._output_operator = terminal_operator_from_topology(topology)
 
+        self._block_ref_counter = BlockRefCounter()
+
         self._op_resource_allocator: Optional[
             "OpResourceAllocator"
         ] = create_resource_allocator(self, data_context)
@@ -171,6 +174,11 @@ def get_external_consumer_bytes(self) -> int:
         """Get the bytes buffered by external consumers."""
         return self._external_consumer_bytes
 
+    @property
+    def block_ref_counter(self) -> BlockRefCounter:
+        """The centralized block reference counter for this executor."""
+        return self._block_ref_counter
+
     def _estimate_object_store_memory_usage(
         self, op: "PhysicalOperator", state: "OpState"
     ) -> int:
diff --git a/python/ray/data/_internal/execution/streaming_executor.py b/python/ray/data/_internal/execution/streaming_executor.py
index abce1c902018..9390927e5875 100644
--- a/python/ray/data/_internal/execution/streaming_executor.py
+++ b/python/ray/data/_internal/execution/streaming_executor.py
@@ -228,6 +228,10 @@ def execute(
             self._data_context,
         )
 
+        counter = self._resource_manager.block_ref_counter
+        for op in self._topology:
+            op.start(self._options, counter)
+
         # Constructed once per executor (not per scheduling iteration) so the
         # guard's idle-detection state accumulates across scheduling iterations.
         self._output_backpressure_guard = OutputBackpressureGuard(
@@ -343,6 +347,9 @@ def shutdown(self, force: bool, exception: Optional[Exception] = None):
                 op.shutdown(timer, force=force)
 
             self._clear_topology_queues_post_shutdown(force, exception)
+            # Queues have been drained; any remaining Ray Core callbacks that fire
+            # after this point should be no-ops.
+            self._resource_manager.block_ref_counter.clear()
 
             min_ = round(timer.min(), 3)
             max_ = round(timer.max(), 3)
diff --git a/python/ray/data/_internal/execution/streaming_executor_state.py b/python/ray/data/_internal/execution/streaming_executor_state.py
index dd68e6adcb27..1c691f13bf23 100644
--- a/python/ray/data/_internal/execution/streaming_executor_state.py
+++ b/python/ray/data/_internal/execution/streaming_executor_state.py
@@ -575,7 +575,6 @@ def setup_state(op: PhysicalOperator) -> OpState:
         # Create state.
         op_state = OpState(op, inqueues)
         topology[op] = op_state
-        op.start(options)
         return op_state
 
     setup_state(dag)
diff --git a/python/ray/data/tests/test_operators.py b/python/ray/data/tests/test_operators.py
index d7ed2636b816..003142594387 100644
--- a/python/ray/data/tests/test_operators.py
+++ b/python/ray/data/tests/test_operators.py
@@ -182,6 +182,8 @@ def all_transform(bundles: List[RefBundle], ctx):
         DataContext.get_current().target_max_block_size,
     )
 
+    op1.start(ExecutionOptions())
+    op2.start(ExecutionOptions())
     while input_op.has_next():
         op1.add_input(input_op.get_next(), 0)
     op1.all_inputs_done()
diff --git a/python/ray/data/tests/test_streaming_executor.py b/python/ray/data/tests/test_streaming_executor.py
index d82ef7353088..5db06dfaf29f 100644
--- a/python/ray/data/tests/test_streaming_executor.py
+++ b/python/ray/data/tests/test_streaming_executor.py
@@ -23,6 +23,7 @@
 from ray.data._internal.execution.backpressure_policy.resource_budget_backpressure_policy import (
     ResourceBudgetBackpressurePolicy,
 )
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.execution_callback import ExecutionCallback
 from ray.data._internal.execution.interfaces import (
     ExecutionOptions,
@@ -1391,6 +1392,13 @@ def ensure_block_metadata_stored_in_plasma(monkeypatch):
     monkeypatch.setenv("RAY_max_direct_call_object_size", 0)
 
 
+def _make_data_op_task(task_index, streaming_gen, **kwargs):
+    """Create a DataOpTask with a default BlockRefCounter and producer_id for tests."""
+    kwargs.setdefault("block_ref_counter", BlockRefCounter())
+    kwargs.setdefault("producer_id", "test_op")
+    return DataOpTask(task_index, streaming_gen, **kwargs)
+
+
 class TestDataOpTask:
     def test_on_data_ready_single_output(self, ray_start_regular_shared):
         streaming_gen = create_stub_streaming_gen(block_nbytes=[128 * MiB])
@@ -1398,7 +1406,9 @@ def test_on_data_ready_single_output(self, ray_start_regular_shared):
         def verify_output(bundle):
             assert bundle.size_bytes() == pytest.approx(128 * MiB), bundle.size_bytes()
 
-        data_op_task = DataOpTask(0, streaming_gen, output_ready_callback=verify_output)
+        data_op_task = _make_data_op_task(
+            0, streaming_gen, output_ready_callback=verify_output
+        )
 
         bytes_read = 0
         while not data_op_task.has_finished:
@@ -1414,7 +1424,9 @@ def test_on_data_ready_multiple_outputs(self, ray_start_regular_shared):
         def verify_output(bundle):
             assert bundle.size_bytes() == pytest.approx(128 * MiB), bundle.size_bytes()
 
-        data_op_task = DataOpTask(0, streaming_gen, output_ready_callback=verify_output)
+        data_op_task = _make_data_op_task(
+            0, streaming_gen, output_ready_callback=verify_output
+        )
 
         bytes_read = 0
         while not data_op_task.has_finished:
@@ -1435,7 +1447,7 @@ def verify_exception(exc, task_exec_stats, task_exec_driver_stats):
             assert task_exec_stats is None
             assert task_exec_driver_stats is None
 
-        data_op_task = DataOpTask(
+        data_op_task = _make_data_op_task(
             0,
             streaming_gen,
             task_done_callback=verify_exception,
@@ -1448,11 +1460,11 @@ def verify_exception(exc, task_exec_stats, task_exec_driver_stats):
 
     def test_operator_name_parameter(self, ray_start_regular_shared):
         streaming_gen = create_stub_streaming_gen(block_nbytes=[1])
-        task = DataOpTask(0, streaming_gen, operator_name="MapBatches(fn)")
+        task = _make_data_op_task(0, streaming_gen, operator_name="MapBatches(fn)")
         assert task._operator_name == "MapBatches(fn)"
 
         streaming_gen2 = create_stub_streaming_gen(block_nbytes=[1])
-        task_default = DataOpTask(1, streaming_gen2)
+        task_default = _make_data_op_task(1, streaming_gen2)
         assert task_default._operator_name == "Unknown"
 
     @pytest.mark.parametrize(
@@ -1489,7 +1501,7 @@ def remove_and_add_back_worker_node(_):
             new_worker_node = cluster.add_node(num_cpus=1)  # noqa: F841
             cluster.wait_for_nodes()
 
-        data_op_task = DataOpTask(
+        data_op_task = _make_data_op_task(
             0, streaming_gen, **{preempt_on: remove_and_add_back_worker_node}
         )
 
@@ -1520,7 +1532,7 @@ def test_on_data_ready_with_preemption_after_wait(
 
         # Create a streaming generator that produces a single 128 MiB output block.
         streaming_gen = create_stub_streaming_gen(block_nbytes=[128 * MiB])
-        data_op_task = DataOpTask(0, streaming_gen)
+        data_op_task = _make_data_op_task(0, streaming_gen)
 
         # Wait for the block to be ready, then remove the worker node.
         ray.wait([streaming_gen], fetch_local=False)
@@ -1560,7 +1572,7 @@ def capture_done(exc, task_exec_stats, task_exec_driver_stats):
             captured_stats["task_exec_stats"] = task_exec_stats
             captured_stats["task_exec_driver_stats"] = task_exec_driver_stats
 
-        data_op_task = DataOpTask(
+        data_op_task = _make_data_op_task(
             0,
             streaming_gen,
             task_done_callback=capture_done,

From ef33d7bd3e620ed155f39d116540f51138cb4d7a Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 17 Jun 2026 17:12:23 -0700
Subject: [PATCH 36/53] Add missing type annotations + missing hash shuffle
 change

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../execution/interfaces/physical_operator.py   | 17 ++++++++---------
 .../operators/actor_pool_map_operator.py        |  8 +++++++-
 .../operators/base_physical_operator.py         | 13 +++++++------
 .../execution/operators/hash_shuffle.py         |  7 ++++++-
 .../execution/operators/input_data_buffer.py    | 11 +++++++++--
 .../execution/operators/map_operator.py         |  8 +++++++-
 .../execution/operators/output_splitter.py      | 11 +++++++++--
 .../execution/operators/union_operator.py       | 11 +++++++++--
 .../data/_internal/gpu_shuffle/hash_shuffle.py  | 12 +++++++++---
 9 files changed, 71 insertions(+), 27 deletions(-)

diff --git a/python/ray/data/_internal/execution/interfaces/physical_operator.py b/python/ray/data/_internal/execution/interfaces/physical_operator.py
index fc492400629b..25f9034cadb4 100644
--- a/python/ray/data/_internal/execution/interfaces/physical_operator.py
+++ b/python/ray/data/_internal/execution/interfaces/physical_operator.py
@@ -135,7 +135,7 @@ def __init__(
         self,
         task_index: int,
         streaming_gen: ObjectRefGenerator,
-        block_ref_counter: BlockRefCounter,
+        block_ref_counter: Optional[BlockRefCounter],
         producer_id: str,
         output_ready_callback: Callable[[RefBundle], None] = lambda bundle: None,
         task_done_callback: TaskDoneCallbackType = lambda exc, worker_stats, driver_stats: None,
@@ -177,7 +177,7 @@ def __init__(
         self._block_ready_callback = block_ready_callback
         self._metadata_ready_callback = metadata_ready_callback
         self._operator_name = operator_name
-        self._block_ref_counter: BlockRefCounter = block_ref_counter
+        self._block_ref_counter: Optional[BlockRefCounter] = block_ref_counter
         self._producer_id: str = producer_id
 
         # If the generator hasn't produced block metadata yet, or if the block metadata
@@ -300,9 +300,10 @@ def on_data_ready(self, max_bytes_to_read: Optional[int]) -> int:
                 meta_with_schema_bytes
             )
             meta = meta_with_schema.metadata
-            self._block_ref_counter.on_block_produced(
-                self._pending_block_ref, meta.size_bytes or 0, self._producer_id
-            )
+            if self._block_ref_counter is not None:
+                self._block_ref_counter.on_block_produced(
+                    self._pending_block_ref, meta.size_bytes or 0, self._producer_id
+                )
             self._output_ready_callback(
                 RefBundle(
                     [BlockEntry(self._pending_block_ref, meta)],
@@ -765,11 +766,9 @@ def start(
         Args:
             options: The global options used for the overall execution.
             block_ref_counter: The executor-wide shared counter for tracking
-                object-store memory. If omitted, a fresh per-operator counter is used.
+                object-store memory.
         """
-        self._block_ref_counter = (
-            block_ref_counter if block_ref_counter is not None else BlockRefCounter()
-        )
+        self._block_ref_counter = block_ref_counter
         self._started = True
 
     def can_add_input(self) -> bool:
diff --git a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py
index de38d336c551..f5b48b6db324 100644
--- a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py
+++ b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py
@@ -23,6 +23,8 @@
 
 if TYPE_CHECKING:
     import pyarrow as pa
+
+    from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 import ray
 from ray.actor import ActorHandle
 from ray.core.generated import gcs_pb2
@@ -265,7 +267,11 @@ def _apply_default_actor_task_remote_args(
 
         return ray_actor_task_remote_args
 
-    def start(self, options: ExecutionOptions, block_ref_counter=None):
+    def start(
+        self,
+        options: ExecutionOptions,
+        block_ref_counter: Optional["BlockRefCounter"] = None,
+    ):
         self._actor_locality_enabled = options.actor_locality_enabled
         super().start(options, block_ref_counter)
 
diff --git a/python/ray/data/_internal/execution/operators/base_physical_operator.py b/python/ray/data/_internal/execution/operators/base_physical_operator.py
index e48562a07e5c..f0220394d2c0 100644
--- a/python/ray/data/_internal/execution/operators/base_physical_operator.py
+++ b/python/ray/data/_internal/execution/operators/base_physical_operator.py
@@ -193,12 +193,13 @@ def all_inputs_done(self) -> None:
         output_buffer, self._stats = self._bulk_fn(input_bundles, ctx)
         self._output_buffer = FIFOBundleQueue(output_buffer)
 
-        for bundle in output_buffer:
-            for entry in bundle.blocks:
-                if entry.ref not in input_refs:
-                    self._block_ref_counter.on_block_produced(
-                        entry.ref, entry.metadata.size_bytes or 0, self.id
-                    )
+        if self._block_ref_counter is not None:
+            for bundle in output_buffer:
+                for entry in bundle.blocks:
+                    if entry.ref not in input_refs:
+                        self._block_ref_counter.on_block_produced(
+                            entry.ref, entry.metadata.size_bytes or 0, self.id
+                        )
 
         while self._input_buffer.has_next():
             refs = self._input_buffer.get_next()
diff --git a/python/ray/data/_internal/execution/operators/hash_shuffle.py b/python/ray/data/_internal/execution/operators/hash_shuffle.py
index a895edebdcc7..5f5a5b58c25b 100644
--- a/python/ray/data/_internal/execution/operators/hash_shuffle.py
+++ b/python/ray/data/_internal/execution/operators/hash_shuffle.py
@@ -80,6 +80,7 @@
 )
 
 if typing.TYPE_CHECKING:
+    from ray.data._internal.execution.block_ref_counter import BlockRefCounter
     from ray.data._internal.progress.base_progress import BaseProgressBar
 
 logger = logging.getLogger(__name__)
@@ -673,7 +674,11 @@ def __init__(
         self._reduce_bar = None
         self._reduce_metrics = OpRuntimeMetrics(self)
 
-    def start(self, options: ExecutionOptions, block_ref_counter=None) -> None:
+    def start(
+        self,
+        options: ExecutionOptions,
+        block_ref_counter: Optional["BlockRefCounter"] = None,
+    ) -> None:
         super().start(options, block_ref_counter)
 
     @property
diff --git a/python/ray/data/_internal/execution/operators/input_data_buffer.py b/python/ray/data/_internal/execution/operators/input_data_buffer.py
index 5b89ea02c7d3..16bb37993f43 100644
--- a/python/ray/data/_internal/execution/operators/input_data_buffer.py
+++ b/python/ray/data/_internal/execution/operators/input_data_buffer.py
@@ -1,4 +1,7 @@
-from typing import Callable, List, Optional
+from typing import TYPE_CHECKING, Callable, List, Optional
+
+if TYPE_CHECKING:
+    from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 
 from ray.data._internal.execution.interfaces import (
     ExecutionOptions,
@@ -45,7 +48,11 @@ def __init__(
         self._input_data_index = 0
         self.mark_execution_finished()
 
-    def start(self, options: ExecutionOptions, block_ref_counter=None) -> None:
+    def start(
+        self,
+        options: ExecutionOptions,
+        block_ref_counter: Optional["BlockRefCounter"] = None,
+    ) -> None:
         if not self._is_input_initialized:
             self._input_data = self._input_data_factory(
                 self.target_max_block_size_override
diff --git a/python/ray/data/_internal/execution/operators/map_operator.py b/python/ray/data/_internal/execution/operators/map_operator.py
index 7e4a205def6f..d80788632ecd 100644
--- a/python/ray/data/_internal/execution/operators/map_operator.py
+++ b/python/ray/data/_internal/execution/operators/map_operator.py
@@ -29,6 +29,8 @@
 if TYPE_CHECKING:
     import pyarrow as pa
 
+    from ray.data._internal.execution.block_ref_counter import BlockRefCounter
+
 import ray
 from ray import ObjectRef
 from ray._raylet import ObjectRefGenerator
@@ -485,7 +487,11 @@ def create(
         else:
             raise ValueError(f"Unsupported execution strategy {compute_strategy}")
 
-    def start(self, options: "ExecutionOptions", block_ref_counter=None):
+    def start(
+        self,
+        options: "ExecutionOptions",
+        block_ref_counter: Optional["BlockRefCounter"] = None,
+    ):
         super().start(options, block_ref_counter)
         # Create output queue with desired ordering semantics.
         if options.preserve_order:
diff --git a/python/ray/data/_internal/execution/operators/output_splitter.py b/python/ray/data/_internal/execution/operators/output_splitter.py
index b3fdc566970e..5180506156a3 100644
--- a/python/ray/data/_internal/execution/operators/output_splitter.py
+++ b/python/ray/data/_internal/execution/operators/output_splitter.py
@@ -2,7 +2,10 @@
 import math
 import time
 from dataclasses import replace
-from typing import Any, Collection, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Collection, Dict, List, Optional, Tuple
+
+if TYPE_CHECKING:
+    from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 
 from typing_extensions import override
 
@@ -124,7 +127,11 @@ def num_output_rows_total(self) -> Optional[int]:
         # The total number of rows is the same as the number of input rows.
         return self.input_dependencies[0].num_output_rows_total()
 
-    def start(self, options: ExecutionOptions, block_ref_counter=None) -> None:
+    def start(
+        self,
+        options: ExecutionOptions,
+        block_ref_counter: Optional["BlockRefCounter"] = None,
+    ) -> None:
         if options.preserve_order:
             # If preserve_order is set, we need to ignore locality hints to ensure determinism.
             self._locality_hints = None
diff --git a/python/ray/data/_internal/execution/operators/union_operator.py b/python/ray/data/_internal/execution/operators/union_operator.py
index 62a0f8be1ff4..d07bb4639a3d 100644
--- a/python/ray/data/_internal/execution/operators/union_operator.py
+++ b/python/ray/data/_internal/execution/operators/union_operator.py
@@ -1,7 +1,10 @@
-from typing import List, Optional
+from typing import TYPE_CHECKING, List, Optional
 
 from typing_extensions import override
 
+if TYPE_CHECKING:
+    from ray.data._internal.execution.block_ref_counter import BlockRefCounter
+
 from ray.data._internal.execution.bundle_queue import BaseBundleQueue, FIFOBundleQueue
 from ray.data._internal.execution.interfaces import (
     ExecutionOptions,
@@ -59,7 +62,11 @@ def _input_queues(self) -> List["BaseBundleQueue"]:
     def _output_queues(self) -> List["BaseBundleQueue"]:
         return [self._output_buffer]
 
-    def start(self, options: ExecutionOptions, block_ref_counter=None):
+    def start(
+        self,
+        options: ExecutionOptions,
+        block_ref_counter: Optional["BlockRefCounter"] = None,
+    ):
         # Whether to preserve deterministic ordering of output blocks.
         # When True, blocks are emitted in round-robin order across inputs,
         # ensuring the same input always produces the same output order.
diff --git a/python/ray/data/_internal/gpu_shuffle/hash_shuffle.py b/python/ray/data/_internal/gpu_shuffle/hash_shuffle.py
index 97e37b863fa8..f9271d6b56c8 100644
--- a/python/ray/data/_internal/gpu_shuffle/hash_shuffle.py
+++ b/python/ray/data/_internal/gpu_shuffle/hash_shuffle.py
@@ -40,7 +40,7 @@
 from ray.data.context import DataContext
 
 if typing.TYPE_CHECKING:
-
+    from ray.data._internal.execution.block_ref_counter import BlockRefCounter
     from ray.data._internal.execution.interfaces.physical_operator import ActorPoolInfo
     from ray.data._internal.progress.base_progress import BaseProgressBar
 
@@ -490,8 +490,12 @@ def __init__(
     # Lifecycle
     # ------------------------------------------------------------------
 
-    def start(self, options: ExecutionOptions) -> None:
-        super().start(options)
+    def start(
+        self,
+        options: ExecutionOptions,
+        block_ref_counter: Optional["BlockRefCounter"] = None,
+    ) -> None:
+        super().start(options, block_ref_counter)
         self._rank_pool.start()
 
     def _add_input_inner(self, bundle: RefBundle, input_index: int) -> None:
@@ -625,6 +629,8 @@ def _on_extraction_done(
             data_task = DataOpTask(
                 task_index=rank_idx,
                 streaming_gen=block_gen,
+                block_ref_counter=self._block_ref_counter,
+                producer_id=self.id,
                 output_ready_callback=_on_bundle_ready,
                 task_done_callback=functools.partial(
                     _on_extraction_done, rank=rank_idx

From eae4a983b24e134cb850b50ac973cc88dbda1079 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 22 Jun 2026 15:54:43 -0700
Subject: [PATCH 37/53] Address comments + Make BlockRefCounter mandatory in
 call chains

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../execution/interfaces/physical_operator.py | 13 +++--
 .../operators/actor_pool_map_operator.py      |  2 +-
 .../operators/base_physical_operator.py       | 17 ++----
 .../execution/operators/hash_shuffle.py       |  2 +-
 .../execution/operators/input_data_buffer.py  |  2 +-
 .../execution/operators/map_operator.py       |  2 +-
 .../execution/operators/output_splitter.py    |  2 +-
 .../shuffle_reduce_operator.py                |  2 +
 .../execution/operators/union_operator.py     |  2 +-
 .../_internal/execution/resource_manager.py   |  8 +--
 .../_internal/execution/streaming_executor.py |  8 +--
 .../_internal/gpu_shuffle/hash_shuffle.py     |  2 +-
 .../tests/test_actor_pool_map_operator.py     | 13 ++---
 .../test_executor_resource_management.py      | 17 +++---
 python/ray/data/tests/test_map_operator.py    | 24 +++++----
 python/ray/data/tests/test_operators.py       |  9 ++--
 python/ray/data/tests/test_output_splitter.py |  9 ++--
 ...st_reservation_based_resource_allocator.py | 17 ++++++
 .../ray/data/tests/test_resource_manager.py   | 26 ++++++++-
 .../ray/data/tests/test_streaming_executor.py | 53 ++++++++++++-------
 .../data/tests/unit/test_resource_manager.py  |  3 ++
 21 files changed, 146 insertions(+), 87 deletions(-)

diff --git a/python/ray/data/_internal/execution/interfaces/physical_operator.py b/python/ray/data/_internal/execution/interfaces/physical_operator.py
index 25f9034cadb4..8a76bca21b23 100644
--- a/python/ray/data/_internal/execution/interfaces/physical_operator.py
+++ b/python/ray/data/_internal/execution/interfaces/physical_operator.py
@@ -135,7 +135,7 @@ def __init__(
         self,
         task_index: int,
         streaming_gen: ObjectRefGenerator,
-        block_ref_counter: Optional[BlockRefCounter],
+        block_ref_counter: BlockRefCounter,
         producer_id: str,
         output_ready_callback: Callable[[RefBundle], None] = lambda bundle: None,
         task_done_callback: TaskDoneCallbackType = lambda exc, worker_stats, driver_stats: None,
@@ -177,7 +177,7 @@ def __init__(
         self._block_ready_callback = block_ready_callback
         self._metadata_ready_callback = metadata_ready_callback
         self._operator_name = operator_name
-        self._block_ref_counter: Optional[BlockRefCounter] = block_ref_counter
+        self._block_ref_counter: BlockRefCounter = block_ref_counter
         self._producer_id: str = producer_id
 
         # If the generator hasn't produced block metadata yet, or if the block metadata
@@ -300,10 +300,9 @@ def on_data_ready(self, max_bytes_to_read: Optional[int]) -> int:
                 meta_with_schema_bytes
             )
             meta = meta_with_schema.metadata
-            if self._block_ref_counter is not None:
-                self._block_ref_counter.on_block_produced(
-                    self._pending_block_ref, meta.size_bytes or 0, self._producer_id
-                )
+            self._block_ref_counter.on_block_produced(
+                self._pending_block_ref, meta.size_bytes or 0, self._producer_id
+            )
             self._output_ready_callback(
                 RefBundle(
                     [BlockEntry(self._pending_block_ref, meta)],
@@ -759,7 +758,7 @@ def num_output_splits(self) -> int:
     def start(
         self,
         options: ExecutionOptions,
-        block_ref_counter: Optional[BlockRefCounter] = None,
+        block_ref_counter: BlockRefCounter,
     ) -> None:
         """Called by the executor when execution starts for an operator.
 
diff --git a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py
index f5b48b6db324..76afc11e4e1e 100644
--- a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py
+++ b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py
@@ -270,7 +270,7 @@ def _apply_default_actor_task_remote_args(
     def start(
         self,
         options: ExecutionOptions,
-        block_ref_counter: Optional["BlockRefCounter"] = None,
+        block_ref_counter: "BlockRefCounter",
     ):
         self._actor_locality_enabled = options.actor_locality_enabled
         super().start(options, block_ref_counter)
diff --git a/python/ray/data/_internal/execution/operators/base_physical_operator.py b/python/ray/data/_internal/execution/operators/base_physical_operator.py
index f0220394d2c0..2bd382dce159 100644
--- a/python/ray/data/_internal/execution/operators/base_physical_operator.py
+++ b/python/ray/data/_internal/execution/operators/base_physical_operator.py
@@ -184,22 +184,15 @@ def all_inputs_done(self) -> None:
         # NOTE: We don't account object store memory use from intermediate `bulk_fn`
         # outputs (e.g., map outputs for map-reduce).
 
-        # Snapshot input refs before calling bulk_fn. Some bulk_fns (e.g.
-        # randomize_blocks) forward input ObjectRefs unchanged to the output.
-        # We only call on_block_produced for genuinely new refs to avoid
-        # double-counting; forwarded refs stay attributed to their original producer.
         input_bundles = self._input_buffer.to_list()
-        input_refs = {entry.ref for bundle in input_bundles for entry in bundle.blocks}
         output_buffer, self._stats = self._bulk_fn(input_bundles, ctx)
         self._output_buffer = FIFOBundleQueue(output_buffer)
 
-        if self._block_ref_counter is not None:
-            for bundle in output_buffer:
-                for entry in bundle.blocks:
-                    if entry.ref not in input_refs:
-                        self._block_ref_counter.on_block_produced(
-                            entry.ref, entry.metadata.size_bytes or 0, self.id
-                        )
+        for bundle in output_buffer:
+            for entry in bundle.blocks:
+                self._block_ref_counter.on_block_produced(
+                    entry.ref, entry.metadata.size_bytes or 0, self.id
+                )
 
         while self._input_buffer.has_next():
             refs = self._input_buffer.get_next()
diff --git a/python/ray/data/_internal/execution/operators/hash_shuffle.py b/python/ray/data/_internal/execution/operators/hash_shuffle.py
index 5f5a5b58c25b..1c7a6bbe54c3 100644
--- a/python/ray/data/_internal/execution/operators/hash_shuffle.py
+++ b/python/ray/data/_internal/execution/operators/hash_shuffle.py
@@ -677,7 +677,7 @@ def __init__(
     def start(
         self,
         options: ExecutionOptions,
-        block_ref_counter: Optional["BlockRefCounter"] = None,
+        block_ref_counter: "BlockRefCounter",
     ) -> None:
         super().start(options, block_ref_counter)
 
diff --git a/python/ray/data/_internal/execution/operators/input_data_buffer.py b/python/ray/data/_internal/execution/operators/input_data_buffer.py
index 16bb37993f43..2d4c9350ad64 100644
--- a/python/ray/data/_internal/execution/operators/input_data_buffer.py
+++ b/python/ray/data/_internal/execution/operators/input_data_buffer.py
@@ -51,7 +51,7 @@ def __init__(
     def start(
         self,
         options: ExecutionOptions,
-        block_ref_counter: Optional["BlockRefCounter"] = None,
+        block_ref_counter: "BlockRefCounter",
     ) -> None:
         if not self._is_input_initialized:
             self._input_data = self._input_data_factory(
diff --git a/python/ray/data/_internal/execution/operators/map_operator.py b/python/ray/data/_internal/execution/operators/map_operator.py
index d80788632ecd..5a72596d3af2 100644
--- a/python/ray/data/_internal/execution/operators/map_operator.py
+++ b/python/ray/data/_internal/execution/operators/map_operator.py
@@ -490,7 +490,7 @@ def create(
     def start(
         self,
         options: "ExecutionOptions",
-        block_ref_counter: Optional["BlockRefCounter"] = None,
+        block_ref_counter: "BlockRefCounter",
     ):
         super().start(options, block_ref_counter)
         # Create output queue with desired ordering semantics.
diff --git a/python/ray/data/_internal/execution/operators/output_splitter.py b/python/ray/data/_internal/execution/operators/output_splitter.py
index 5180506156a3..3607f6c187bf 100644
--- a/python/ray/data/_internal/execution/operators/output_splitter.py
+++ b/python/ray/data/_internal/execution/operators/output_splitter.py
@@ -130,7 +130,7 @@ def num_output_rows_total(self) -> Optional[int]:
     def start(
         self,
         options: ExecutionOptions,
-        block_ref_counter: Optional["BlockRefCounter"] = None,
+        block_ref_counter: "BlockRefCounter",
     ) -> None:
         if options.preserve_order:
             # If preserve_order is set, we need to ignore locality hints to ensure determinism.
diff --git a/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py b/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py
index 8159538c2a02..e97701d86879 100644
--- a/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py
+++ b/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py
@@ -164,6 +164,8 @@ def _add_input_inner(self, refs: RefBundle, input_index: int) -> None:
         data_task = DataOpTask(
             task_index=partition_id,
             streaming_gen=block_gen,
+            block_ref_counter=self._block_ref_counter,
+            producer_id=self.id,
             output_ready_callback=functools.partial(
                 self._handle_reduce_output_ready, partition_id
             ),
diff --git a/python/ray/data/_internal/execution/operators/union_operator.py b/python/ray/data/_internal/execution/operators/union_operator.py
index d07bb4639a3d..468d5f52f2b8 100644
--- a/python/ray/data/_internal/execution/operators/union_operator.py
+++ b/python/ray/data/_internal/execution/operators/union_operator.py
@@ -65,7 +65,7 @@ def _output_queues(self) -> List["BaseBundleQueue"]:
     def start(
         self,
         options: ExecutionOptions,
-        block_ref_counter: Optional["BlockRefCounter"] = None,
+        block_ref_counter: "BlockRefCounter",
     ):
         # Whether to preserve deterministic ordering of output blocks.
         # When True, blocks are emitted in round-robin order across inputs,
diff --git a/python/ray/data/_internal/execution/resource_manager.py b/python/ray/data/_internal/execution/resource_manager.py
index 50ff9f5a5c08..aba473be1992 100644
--- a/python/ray/data/_internal/execution/resource_manager.py
+++ b/python/ray/data/_internal/execution/resource_manager.py
@@ -109,6 +109,7 @@ def __init__(
         options: ExecutionOptions,
         get_total_resources: Callable[[], ExecutionResources],
         data_context: DataContext,
+        block_ref_counter: BlockRefCounter,
     ):
         self._topology = topology
         self._options = options
@@ -141,7 +142,7 @@ def __init__(
         # operator's output usage.
         self._output_operator = terminal_operator_from_topology(topology)
 
-        self._block_ref_counter = BlockRefCounter()
+        self._block_ref_counter = block_ref_counter
 
         self._op_resource_allocator: Optional[
             "OpResourceAllocator"
@@ -174,11 +175,6 @@ def get_external_consumer_bytes(self) -> int:
         """Get the bytes buffered by external consumers."""
         return self._external_consumer_bytes
 
-    @property
-    def block_ref_counter(self) -> BlockRefCounter:
-        """The centralized block reference counter for this executor."""
-        return self._block_ref_counter
-
     def _estimate_object_store_memory_usage(
         self, op: "PhysicalOperator", state: "OpState"
     ) -> int:
diff --git a/python/ray/data/_internal/execution/streaming_executor.py b/python/ray/data/_internal/execution/streaming_executor.py
index 9390927e5875..bfc081c8546a 100644
--- a/python/ray/data/_internal/execution/streaming_executor.py
+++ b/python/ray/data/_internal/execution/streaming_executor.py
@@ -14,6 +14,7 @@
     BackpressurePolicy,
     get_backpressure_policies,
 )
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.dataset_state import DatasetState
 from ray.data._internal.execution.execution_callback import ExecutionCallback
 from ray.data._internal.execution.interfaces import (
@@ -221,16 +222,17 @@ def execute(
         # Setup the streaming DAG topology and start the runner thread.
         self._topology = build_streaming_topology(dag, self._options)
 
+        self._block_ref_counter = BlockRefCounter()
         self._resource_manager = ResourceManager(
             self._topology,
             self._options,
             lambda: self._cluster_autoscaler.get_total_resources(),
             self._data_context,
+            self._block_ref_counter,
         )
 
-        counter = self._resource_manager.block_ref_counter
         for op in self._topology:
-            op.start(self._options, counter)
+            op.start(self._options, self._block_ref_counter)
 
         # Constructed once per executor (not per scheduling iteration) so the
         # guard's idle-detection state accumulates across scheduling iterations.
@@ -349,7 +351,7 @@ def shutdown(self, force: bool, exception: Optional[Exception] = None):
             self._clear_topology_queues_post_shutdown(force, exception)
             # Queues have been drained; any remaining Ray Core callbacks that fire
             # after this point should be no-ops.
-            self._resource_manager.block_ref_counter.clear()
+            self._block_ref_counter.clear()
 
             min_ = round(timer.min(), 3)
             max_ = round(timer.max(), 3)
diff --git a/python/ray/data/_internal/gpu_shuffle/hash_shuffle.py b/python/ray/data/_internal/gpu_shuffle/hash_shuffle.py
index f9271d6b56c8..3c9e0ffb3031 100644
--- a/python/ray/data/_internal/gpu_shuffle/hash_shuffle.py
+++ b/python/ray/data/_internal/gpu_shuffle/hash_shuffle.py
@@ -493,7 +493,7 @@ def __init__(
     def start(
         self,
         options: ExecutionOptions,
-        block_ref_counter: Optional["BlockRefCounter"] = None,
+        block_ref_counter: "BlockRefCounter",
     ) -> None:
         super().start(options, block_ref_counter)
         self._rank_pool.start()
diff --git a/python/ray/data/tests/test_actor_pool_map_operator.py b/python/ray/data/tests/test_actor_pool_map_operator.py
index bd7dcdab09b5..94214fd614ef 100644
--- a/python/ray/data/tests/test_actor_pool_map_operator.py
+++ b/python/ray/data/tests/test_actor_pool_map_operator.py
@@ -28,6 +28,7 @@
     _estimate_total_available_task_slots,
 )
 from ray.data._internal.compute import ActorPoolStrategy
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.bundle_queue import HashLinkedQueue
 from ray.data._internal.execution.interfaces import (
     ExecutionOptions,
@@ -964,7 +965,7 @@ def test_setting_initial_size_for_actor_pool():
         ray_remote_args={"num_cpus": 1},
     )
 
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
 
     assert op._actor_pool.get_actor_info() == ActorPoolInfo(
         running=0,
@@ -1005,7 +1006,7 @@ def test_internal_input_queue_is_empty_after_early_completion(
     )
 
     # NOTE: This is blocking, until actor pool is fully started up
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     # Complete init sequence by completing pending metadata tasks (performed
     # by the executor)
     run_op_tasks_sync(op)
@@ -1062,7 +1063,7 @@ def test_actor_pool_input_queue_draining(
     )
 
     # NOTE: This is blocking, until actor pool is fully started up
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
 
     # Finalize operator initialization sequence and make it schedulable
     run_op_tasks_sync(op, only_existing=True)
@@ -1518,7 +1519,7 @@ def _fail():
     )
 
     with pytest.raises(RayActorError, match=r"init_failed"):
-        op.start(ExecutionOptions())
+        op.start(ExecutionOptions(), BlockRefCounter())
 
 
 @pytest.mark.parametrize(
@@ -1571,7 +1572,7 @@ def _failing_transform(
         ray_remote_args={"max_concurrency": max_concurrency},
     )
 
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
 
     # Cannot add input until actor has started.
     assert not op.can_add_input()
@@ -1614,7 +1615,7 @@ def _map_transfom_fn(block_iter: Iterable[Block], _) -> Iterable[Block]:
     actor_pool = op._actor_pool
 
     # Wait for the op to scale up to the min size.
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     run_op_tasks_sync(op)
     assert actor_pool.num_running_actors() == num_actors
     assert op.num_active_tasks() == 0
diff --git a/python/ray/data/tests/test_executor_resource_management.py b/python/ray/data/tests/test_executor_resource_management.py
index 06ebefacc8aa..28cd8de28621 100644
--- a/python/ray/data/tests/test_executor_resource_management.py
+++ b/python/ray/data/tests/test_executor_resource_management.py
@@ -5,6 +5,7 @@
 import ray
 from ray.data._internal.actor_autoscaler import ActorPoolScalingRequest
 from ray.data._internal.compute import ActorPoolStrategy, TaskPoolStrategy
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.interfaces import ExecutionOptions, ExecutionResources
 from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer
 from ray.data._internal.execution.operators.limit_operator import LimitOperator
@@ -208,7 +209,7 @@ def test_task_pool_resource_reporting(ray_start_10_cpus_shared):
         name="TestMapper",
         compute_strategy=TaskPoolStrategy(),
     )
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
 
     assert op.current_logical_usage() == ExecutionResources(cpu=0, gpu=0, memory=0)
     assert op.metrics.obj_store_mem_internal_inqueue == 0
@@ -253,7 +254,7 @@ def test_task_pool_resource_reporting_with_dynamic_remote_args(
         ray_remote_args={"num_cpus": 1},
         ray_remote_args_fn=lambda: {"memory": 500},
     )
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
 
     assert op.current_logical_usage() == ExecutionResources(cpu=0, gpu=0, memory=0)
 
@@ -280,7 +281,7 @@ def test_task_pool_resource_reporting_with_bundling(ray_start_10_cpus_shared):
         compute_strategy=TaskPoolStrategy(),
         min_rows_per_bundle=3,
     )
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
 
     assert op.current_logical_usage() == ExecutionResources(cpu=0, gpu=0, memory=0)
     assert op.metrics.obj_store_mem_internal_inqueue == 0
@@ -340,7 +341,7 @@ def test_actor_pool_scheduling(ray_start_10_cpus_shared, restore_data_context):
     )
 
     # NOTE: This is blocking, until actors are fully started up
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
 
     min_resource_usage, _ = op.min_max_resource_requirements()
     assert min_resource_usage == ExecutionResources(cpu=2, gpu=0, object_store_memory=0)
@@ -461,7 +462,7 @@ def test_actor_pool_resource_reporting_with_dynamic_remote_args(
     )
 
     # Blocking until actors are fully started
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     run_op_tasks_sync(op, only_existing=True)
 
     # Should reflect dynamic resources: 2 actors * (1 cpu, 500 memory)
@@ -491,7 +492,7 @@ def test_actor_pool_scheduling_with_bundling(
     )
 
     # NOTE: This is blocking, until actor pool is fully started up
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
 
     min_resource_usage, _ = op.min_max_resource_requirements()
     assert min_resource_usage == ExecutionResources(cpu=2, gpu=0, object_store_memory=0)
@@ -641,7 +642,7 @@ def test_limit_resource_reporting(ray_start_10_cpus_shared):
         make_ref_bundles([[SMALL_STR, SMALL_STR] for i in range(2)]),
     )  # Two two-row bundles
     op = LimitOperator(3, input_op, DataContext.get_current())
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
 
     assert op.current_logical_usage() == ExecutionResources(
         cpu=0, gpu=0, object_store_memory=0, memory=0
@@ -674,7 +675,7 @@ def test_output_splitter_resource_reporting(ray_start_10_cpus_shared):
         data_context=DataContext.get_current(),
         locality_hints=["0", "1"],
     )
-    op.start(ExecutionOptions(actor_locality_enabled=True))
+    op.start(ExecutionOptions(actor_locality_enabled=True), BlockRefCounter())
 
     assert op.current_logical_usage() == ExecutionResources(
         cpu=0, gpu=0, object_store_memory=0, memory=0
diff --git a/python/ray/data/tests/test_map_operator.py b/python/ray/data/tests/test_map_operator.py
index 4b3c4bee4f3a..81b6050b6b31 100644
--- a/python/ray/data/tests/test_map_operator.py
+++ b/python/ray/data/tests/test_map_operator.py
@@ -9,6 +9,7 @@
 import ray
 from ray._common.test_utils import wait_for_condition
 from ray.data._internal.compute import ActorPoolStrategy, TaskPoolStrategy
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.interfaces import (
     ExecutionOptions,
 )
@@ -75,7 +76,7 @@ def _run_map_operator_test(
     )
 
     # Feed data and block on exec.
-    op.start(ExecutionOptions(preserve_order=preserve_order))
+    op.start(ExecutionOptions(preserve_order=preserve_order), BlockRefCounter())
     if use_actors:
         # Wait for actors to be ready before adding inputs.
         run_op_tasks_sync(op, only_existing=True)
@@ -116,7 +117,10 @@ def test_map_operator_streamed(ray_start_regular_shared, use_actors):
     output = []
     # Use preserve_order so output order matches input order (required for
     # actor pool, which otherwise returns results in completion order).
-    op.start(ExecutionOptions(actor_locality_enabled=True, preserve_order=True))
+    op.start(
+        ExecutionOptions(actor_locality_enabled=True, preserve_order=True),
+        BlockRefCounter(),
+    )
 
     if use_actors:
         # Wait for actors to be ready before adding inputs.
@@ -180,7 +184,7 @@ def test_map_operator_actor_locality_stats(ray_start_regular_shared):
     options = ExecutionOptions()
     options.preserve_order = True
     options.actor_locality_enabled = True
-    op.start(options)
+    op.start(options, BlockRefCounter())
     # Wait for actors to be ready before adding inputs.
     run_op_tasks_sync(op, only_existing=True)
 
@@ -242,7 +246,7 @@ def _check_batch(block_iter: Iterable[Block], ctx) -> Iterable[Block]:
     )
 
     # Feed data and block on exec.
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     if use_actors:
         # Wait for actors to be ready before adding inputs.
         run_op_tasks_sync(op, only_existing=True)
@@ -395,7 +399,7 @@ def test_map_operator_ray_args(shutdown_only, use_actors):
     )
 
     # Feed data and block on exec.
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     if use_actors:
         # Wait for the actor to start.
         run_op_tasks_sync(op)
@@ -443,7 +447,7 @@ def _sleep(block_iter: Iterable[Block]) -> Iterable[Block]:
     )
 
     # Start one task and then cancel.
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     if use_actors:
         # Wait for the actor to start.
         run_op_tasks_sync(op)
@@ -513,7 +517,7 @@ def map_fn(block_iter: Iterable[Block], ctx: TaskContext) -> Iterable[Block]:
         compute_strategy=compute_strategy,
     )
     op.add_map_task_kwargs_fn(lambda: kwargs)
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     if use_actors:
         # Wait for the actor to start.
         run_op_tasks_sync(op)
@@ -576,7 +580,7 @@ def yield_five(block_iter: Iterable[Block], ctx) -> Iterable[Block]:
         min_rows_per_bundle=min_rows_per_bundle,
     )
 
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     while input_op.has_next():
         op.add_input(input_op.get_next(), 0)
         if op.metrics.num_inputs_received % min_rows_per_bundle == 0:
@@ -621,7 +625,7 @@ def yield_five(block_iter: Iterable[Block], ctx) -> Iterable[Block]:
     )
     op.set_additional_split_factor(2)
 
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     while input_op.has_next():
         op.add_input(input_op.get_next(), 0)
         if op.metrics.num_inputs_received % min_rows_per_bundle == 0:
@@ -659,7 +663,7 @@ def map_fn(block_iter: Iterable[Block], ctx) -> Iterable[Block]:
         min_rows_per_bundle=MIN_ROWS_PER_BUNDLE,
     )
 
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     num_outputs_taken = 0
     bytes_outputs_taken = 0
     for i in range(len(inputs)):
diff --git a/python/ray/data/tests/test_operators.py b/python/ray/data/tests/test_operators.py
index 003142594387..b5b1083bbc38 100644
--- a/python/ray/data/tests/test_operators.py
+++ b/python/ray/data/tests/test_operators.py
@@ -5,6 +5,7 @@
 import pytest
 
 import ray
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.interfaces import (
     BlockEntry,
     ExecutionOptions,
@@ -99,7 +100,7 @@ def dummy_all_transform(bundles: List[RefBundle], ctx):
         op.set_sub_progress_bar(name, pg)
 
     # Feed data.
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     while input_op.has_next():
         op.add_input(input_op.get_next(), 0)
     op.all_inputs_done()
@@ -145,7 +146,7 @@ def dummy_all_transform(bundles: List[RefBundle]):
 
     # Feed data and implement streaming exec.
     output = []
-    op1.start(ExecutionOptions(actor_locality_enabled=True))
+    op1.start(ExecutionOptions(actor_locality_enabled=True), BlockRefCounter())
     while input_op.has_next():
         op1.add_input(input_op.get_next(), 0)
         while not op1.has_next():
@@ -182,8 +183,8 @@ def all_transform(bundles: List[RefBundle], ctx):
         DataContext.get_current().target_max_block_size,
     )
 
-    op1.start(ExecutionOptions())
-    op2.start(ExecutionOptions())
+    op1.start(ExecutionOptions(), BlockRefCounter())
+    op2.start(ExecutionOptions(), BlockRefCounter())
     while input_op.has_next():
         op1.add_input(input_op.get_next(), 0)
     op1.all_inputs_done()
diff --git a/python/ray/data/tests/test_output_splitter.py b/python/ray/data/tests/test_output_splitter.py
index 8be226a25fa8..2ad05a8ed324 100644
--- a/python/ray/data/tests/test_output_splitter.py
+++ b/python/ray/data/tests/test_output_splitter.py
@@ -5,6 +5,7 @@
 import pytest
 
 import ray
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.interfaces import ExecutionOptions
 from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer
 from ray.data._internal.execution.operators.output_splitter import OutputSplitter
@@ -35,7 +36,7 @@ def test_split_operator(ray_start_regular_shared, equal, chunk_size):
 
     # Feed data and implement streaming exec.
     output_splits = [[] for _ in range(num_splits)]
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     while input_op.has_next():
         for _ in range(num_add_input_blocks):
             if not input_op.has_next():
@@ -79,7 +80,7 @@ def test_split_operator_random(ray_start_regular_shared, equal, random_seed):
 
     # Feed data and implement streaming exec.
     output_splits = collections.defaultdict(list)
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     while input_op.has_next():
         op.add_input(input_op.get_next(), 0)
     op.all_inputs_done()
@@ -124,7 +125,7 @@ def get_bundle_loc(bundle):
 
     # Feed data and implement streaming exec.
     output_splits = collections.defaultdict(list)
-    op.start(ExecutionOptions(actor_locality_enabled=True))
+    op.start(ExecutionOptions(actor_locality_enabled=True), BlockRefCounter())
     while input_op.has_next():
         op.add_input(input_op.get_next(), 0)
     op.all_inputs_done()
@@ -192,7 +193,7 @@ def _get_fake_bundle_loc(bundle):
     output_splits = [[] for _ in range(3)]
     yielded_incrementally = 0
 
-    op.start(ExecutionOptions(actor_locality_enabled=True))
+    op.start(ExecutionOptions(actor_locality_enabled=True), BlockRefCounter())
     while input_op.has_next():
         op.add_input(input_op.get_next(), 0)
 
diff --git a/python/ray/data/tests/test_reservation_based_resource_allocator.py b/python/ray/data/tests/test_reservation_based_resource_allocator.py
index 444ca0946aa3..53309bb94646 100644
--- a/python/ray/data/tests/test_reservation_based_resource_allocator.py
+++ b/python/ray/data/tests/test_reservation_based_resource_allocator.py
@@ -3,6 +3,7 @@
 import pytest
 
 import ray
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.interfaces.execution_options import (
     ExecutionOptions,
     ExecutionResources,
@@ -72,6 +73,7 @@ def mock_get_global_limits():
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = op_internal_usage
@@ -237,6 +239,7 @@ def test_reserve_min_resource_requirements(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(
             return_value=ExecutionResources.zero()
@@ -298,6 +301,7 @@ def test_reserve_min_resources_for_gpu_ops(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(
             return_value=ExecutionResources.zero()
@@ -333,6 +337,7 @@ def test_does_not_reserve_more_than_max_resource_usage(self):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(
             return_value=ExecutionResources.zero()
@@ -398,6 +403,7 @@ def test_budget_capped_by_max_resource_usage(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = {o1: 0, o2: 40, o3: 40}
@@ -475,6 +481,7 @@ def test_budget_capped_by_max_resource_usage_all_capped(self, restore_data_conte
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = {o1: 0, o2: 40, o3: 40}
@@ -509,6 +516,7 @@ def test_only_handle_eligible_ops(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(
             return_value=ExecutionResources.zero()
@@ -572,6 +580,7 @@ def test_gpu_allocation(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = dict.fromkeys([o1, o2, o3], 0)
@@ -620,6 +629,7 @@ def test_multiple_gpu_operators(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = dict.fromkeys([o1, o2, o3], 0)
@@ -663,6 +673,7 @@ def test_gpu_usage_exceeds_global_limits(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = dict.fromkeys([o1, o2], 0)
@@ -709,6 +720,7 @@ def test_gpu_unbounded_operator_can_autoscale(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = dict.fromkeys([o1, o2], 0)
@@ -780,6 +792,7 @@ def test_actor_pool_gpu_operator_gets_gpu_budget_in_cpu_pipeline(
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = dict.fromkeys([o1, o2, o3, o4, o5], 0)
@@ -836,6 +849,7 @@ def test_gpu_bounded_vs_unbounded_operators(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = dict.fromkeys([o1, o2, o3], 0)
@@ -924,6 +938,7 @@ def test_gpu_not_reserved_for_non_gpu_operators(
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = dict.fromkeys(ops, 0)
@@ -985,6 +1000,7 @@ def test_reservation_accounts_for_completed_ops(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = op_internal_usage
@@ -1082,6 +1098,7 @@ def mock_get_global_limits():
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager.get_global_limits = MagicMock(
diff --git a/python/ray/data/tests/test_resource_manager.py b/python/ray/data/tests/test_resource_manager.py
index f06e82a74aa7..a12facc53cf6 100644
--- a/python/ray/data/tests/test_resource_manager.py
+++ b/python/ray/data/tests/test_resource_manager.py
@@ -10,6 +10,7 @@
 
 import ray
 from ray.data._internal.compute import ComputeStrategy
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.interfaces import BlockEntry, PhysicalOperator
 from ray.data._internal.execution.interfaces.execution_options import (
     ExecutionOptions,
@@ -56,7 +57,7 @@ def mock_map_op(
         compute_strategy=compute_strategy,
         name=name,
     )
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     return op
 
 
@@ -101,7 +102,7 @@ def mock_all_to_all_op(input_op, name="MockShuffle"):
         data_context=DataContext.get_current(),
         name=name,
     )
-    op.start(ExecutionOptions())
+    op.start(ExecutionOptions(), BlockRefCounter())
     return op
 
 
@@ -119,6 +120,7 @@ def _resource_manager_for_limits_only_test(
         options,
         get_total_resources,
         DataContext.get_current(),
+        BlockRefCounter(),
     )
 
 
@@ -283,6 +285,7 @@ def test_update_usage(self):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager._op_resource_allocator = None
         resource_manager.update_usages()
@@ -345,6 +348,7 @@ def test_object_store_usage(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(return_value=ExecutionResources.zero()),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         ray.data.DataContext.get_current()._max_num_blocks_in_streaming_gen_buffer = 1
         ray.data.DataContext.get_current().target_max_block_size = 2
@@ -454,6 +458,7 @@ def test_object_store_accounting_delegates_to_op(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(return_value=ExecutionResources.zero()),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
 
         resource_manager.update_usages()
@@ -490,6 +495,7 @@ def test_get_completed_ops_usage(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
 
@@ -552,6 +558,7 @@ def test_get_completed_ops_usage_complex_graph(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
 
@@ -582,6 +589,7 @@ def test_external_consumer_bytes_attributed_to_terminal_operator(
             ExecutionOptions(),
             lambda: cluster_resources,
             DataContext.get_current(),
+            BlockRefCounter(),
         )
 
         for op in [o1, o2, o3]:
@@ -650,6 +658,7 @@ def test_external_consumer_bytes_input_data_buffer_sink(self, restore_data_conte
             ExecutionOptions(),
             lambda: ExecutionResources(cpu=10, gpu=0, object_store_memory=1000),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         buf.current_logical_usage = MagicMock(return_value=ExecutionResources.zero())
         buf.running_logical_usage = MagicMock(return_value=ExecutionResources.zero())
@@ -681,6 +690,7 @@ def test_external_consumer_bytes_surfaced_in_op_usage_str(
             ExecutionOptions(),
             lambda: cluster_resources,
             DataContext.get_current(),
+            BlockRefCounter(),
         )
 
         for op in [o1, o2, o3]:
@@ -723,6 +733,7 @@ def test_topology_rejects_multiple_terminal_operators(self, restore_data_context
                 ExecutionOptions(),
                 MagicMock(return_value=ExecutionResources.zero()),
                 DataContext.get_current(),
+                BlockRefCounter(),
             )
 
     def test_topology_rejects_empty_topology(self, restore_data_context):
@@ -732,6 +743,7 @@ def test_topology_rejects_empty_topology(self, restore_data_context):
                 ExecutionOptions(),
                 MagicMock(return_value=ExecutionResources.zero()),
                 DataContext.get_current(),
+                BlockRefCounter(),
             )
 
     def test_topology_rejects_no_terminal_operator(self, restore_data_context):
@@ -749,6 +761,7 @@ def test_topology_rejects_no_terminal_operator(self, restore_data_context):
                 ExecutionOptions(),
                 MagicMock(return_value=ExecutionResources.zero()),
                 DataContext.get_current(),
+                BlockRefCounter(),
             )
 
     def test_is_blocking_materializing_op(self, restore_data_context):
@@ -777,6 +790,7 @@ def test_is_blocking_materializing_op(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
 
         # Case 1: Shuffle operator itself is blocking materializing
@@ -807,6 +821,7 @@ def test_is_blocking_materializing_op(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
 
         # o5's downstream (o6, o7) has no blocking materializing ops
@@ -834,6 +849,7 @@ def test_memory_limit_blocks_task_submission(self, restore_data_context):
             options=options,
             get_total_resources=lambda: cluster_resources,
             data_context=DataContext.get_current(),
+            block_ref_counter=BlockRefCounter(),
         )
         resource_manager.update_usages()
 
@@ -866,6 +882,7 @@ def test_unblock_backpressure_terminal_operator(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         guard = OutputBackpressureGuard(topo, resource_manager)
 
@@ -883,6 +900,7 @@ def test_unblock_backpressure_terminal_operator(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         guard = OutputBackpressureGuard(topo, resource_manager)
 
@@ -910,6 +928,7 @@ def test_no_unblock_backpressure_terminal_with_external_consumer(
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         guard = OutputBackpressureGuard(topo, resource_manager)
 
@@ -944,6 +963,7 @@ def test_unblock_backpressure_downstream_idle(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         guard = OutputBackpressureGuard(topo, resource_manager)
         o3.num_active_tasks = MagicMock(return_value=0)
@@ -974,6 +994,7 @@ def test_unblock_backpressure_fallback_to_idle_detector(self, restore_data_conte
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         guard = OutputBackpressureGuard(topo, resource_manager)
 
@@ -1015,6 +1036,7 @@ def test_unblock_when_resource_allocator_disabled(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
+            BlockRefCounter(),
         )
         assert not resource_manager.op_resource_allocator_enabled()
 
diff --git a/python/ray/data/tests/test_streaming_executor.py b/python/ray/data/tests/test_streaming_executor.py
index 5db06dfaf29f..8e0578c95caf 100644
--- a/python/ray/data/tests/test_streaming_executor.py
+++ b/python/ray/data/tests/test_streaming_executor.py
@@ -534,6 +534,7 @@ def test_process_completed_tasks_unblocks_when_non_resource_budget_policy_zeros_
         ExecutionOptions(),
         MagicMock(),
         DataContext.get_current(),
+        BlockRefCounter(),
     )
     guard = OutputBackpressureGuard(topo, resource_manager)
 
@@ -754,6 +755,7 @@ def test_debug_dump_topology(ray_start_regular_shared):
         ExecutionOptions(),
         MagicMock(return_value=ExecutionResources.zero()),
         DataContext.get_current(),
+        BlockRefCounter(),
     )
     resource_manager.update_usages()
     # Just a sanity check to ensure it doesn't crash.
@@ -1392,13 +1394,6 @@ def ensure_block_metadata_stored_in_plasma(monkeypatch):
     monkeypatch.setenv("RAY_max_direct_call_object_size", 0)
 
 
-def _make_data_op_task(task_index, streaming_gen, **kwargs):
-    """Create a DataOpTask with a default BlockRefCounter and producer_id for tests."""
-    kwargs.setdefault("block_ref_counter", BlockRefCounter())
-    kwargs.setdefault("producer_id", "test_op")
-    return DataOpTask(task_index, streaming_gen, **kwargs)
-
-
 class TestDataOpTask:
     def test_on_data_ready_single_output(self, ray_start_regular_shared):
         streaming_gen = create_stub_streaming_gen(block_nbytes=[128 * MiB])
@@ -1406,8 +1401,12 @@ def test_on_data_ready_single_output(self, ray_start_regular_shared):
         def verify_output(bundle):
             assert bundle.size_bytes() == pytest.approx(128 * MiB), bundle.size_bytes()
 
-        data_op_task = _make_data_op_task(
-            0, streaming_gen, output_ready_callback=verify_output
+        data_op_task = DataOpTask(
+            0,
+            streaming_gen,
+            BlockRefCounter(),
+            "test_op",
+            output_ready_callback=verify_output,
         )
 
         bytes_read = 0
@@ -1424,8 +1423,12 @@ def test_on_data_ready_multiple_outputs(self, ray_start_regular_shared):
         def verify_output(bundle):
             assert bundle.size_bytes() == pytest.approx(128 * MiB), bundle.size_bytes()
 
-        data_op_task = _make_data_op_task(
-            0, streaming_gen, output_ready_callback=verify_output
+        data_op_task = DataOpTask(
+            0,
+            streaming_gen,
+            BlockRefCounter(),
+            "test_op",
+            output_ready_callback=verify_output,
         )
 
         bytes_read = 0
@@ -1447,9 +1450,11 @@ def verify_exception(exc, task_exec_stats, task_exec_driver_stats):
             assert task_exec_stats is None
             assert task_exec_driver_stats is None
 
-        data_op_task = _make_data_op_task(
+        data_op_task = DataOpTask(
             0,
             streaming_gen,
+            BlockRefCounter(),
+            "test_op",
             task_done_callback=verify_exception,
         )
 
@@ -1460,11 +1465,17 @@ def verify_exception(exc, task_exec_stats, task_exec_driver_stats):
 
     def test_operator_name_parameter(self, ray_start_regular_shared):
         streaming_gen = create_stub_streaming_gen(block_nbytes=[1])
-        task = _make_data_op_task(0, streaming_gen, operator_name="MapBatches(fn)")
+        task = DataOpTask(
+            0,
+            streaming_gen,
+            BlockRefCounter(),
+            "test_op",
+            operator_name="MapBatches(fn)",
+        )
         assert task._operator_name == "MapBatches(fn)"
 
         streaming_gen2 = create_stub_streaming_gen(block_nbytes=[1])
-        task_default = _make_data_op_task(1, streaming_gen2)
+        task_default = DataOpTask(1, streaming_gen2, BlockRefCounter(), "test_op")
         assert task_default._operator_name == "Unknown"
 
     @pytest.mark.parametrize(
@@ -1501,8 +1512,12 @@ def remove_and_add_back_worker_node(_):
             new_worker_node = cluster.add_node(num_cpus=1)  # noqa: F841
             cluster.wait_for_nodes()
 
-        data_op_task = _make_data_op_task(
-            0, streaming_gen, **{preempt_on: remove_and_add_back_worker_node}
+        data_op_task = DataOpTask(
+            0,
+            streaming_gen,
+            BlockRefCounter(),
+            "test_op",
+            **{preempt_on: remove_and_add_back_worker_node},
         )
 
         # Run the task to completion.
@@ -1532,7 +1547,7 @@ def test_on_data_ready_with_preemption_after_wait(
 
         # Create a streaming generator that produces a single 128 MiB output block.
         streaming_gen = create_stub_streaming_gen(block_nbytes=[128 * MiB])
-        data_op_task = _make_data_op_task(0, streaming_gen)
+        data_op_task = DataOpTask(0, streaming_gen, BlockRefCounter(), "test_op")
 
         # Wait for the block to be ready, then remove the worker node.
         ray.wait([streaming_gen], fetch_local=False)
@@ -1572,9 +1587,11 @@ def capture_done(exc, task_exec_stats, task_exec_driver_stats):
             captured_stats["task_exec_stats"] = task_exec_stats
             captured_stats["task_exec_driver_stats"] = task_exec_driver_stats
 
-        data_op_task = _make_data_op_task(
+        data_op_task = DataOpTask(
             0,
             streaming_gen,
+            BlockRefCounter(),
+            "test_op",
             task_done_callback=capture_done,
         )
 
diff --git a/python/ray/data/tests/unit/test_resource_manager.py b/python/ray/data/tests/unit/test_resource_manager.py
index de9aa94400cc..cfeb83848b77 100644
--- a/python/ray/data/tests/unit/test_resource_manager.py
+++ b/python/ray/data/tests/unit/test_resource_manager.py
@@ -1,6 +1,7 @@
 import pytest
 
 import ray
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.interfaces import (
     BlockEntry,
     PhysicalOperator,
@@ -133,6 +134,7 @@ def test_does_not_double_count_usage_from_union():
         ExecutionOptions(),
         lambda: total_resources,
         DataContext.get_current(),
+        BlockRefCounter(),
     )
 
     # Create two 1-byte `RefBundle`s.
@@ -195,6 +197,7 @@ def test_per_input_inqueue_attribution_for_union():
         options,
         lambda: total_resources,
         DataContext.get_current(),
+        BlockRefCounter(),
     )
 
     # Create two 10-byte RefBundles with distinct block refs (simulates real execution

From 063a504d6724093ea7da570b19e0ef6e59e0308a Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 22 Jun 2026 16:53:59 -0700
Subject: [PATCH 38/53] Track blocks for limit, zip, and output splitter

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../data/_internal/execution/operators/limit_operator.py    | 4 ++++
 .../data/_internal/execution/operators/output_splitter.py   | 5 +++++
 .../ray/data/_internal/execution/operators/zip_operator.py  | 6 +++++-
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/python/ray/data/_internal/execution/operators/limit_operator.py b/python/ray/data/_internal/execution/operators/limit_operator.py
index 88dda94cf5fa..522800e557b4 100644
--- a/python/ray/data/_internal/execution/operators/limit_operator.py
+++ b/python/ray/data/_internal/execution/operators/limit_operator.py
@@ -81,6 +81,10 @@ def slice_fn(block, metadata, num_rows) -> Tuple[Block, BlockMetadata]:
                 )
                 out_blocks.append(block)
                 metadata = ray.get(metadata_ref)
+                # Slicing creates a new block; register it for memory tracking.
+                self._block_ref_counter.on_block_produced(
+                    block, metadata.size_bytes or 0, self.id
+                )
                 out_metadata.append(metadata)
                 self._output_blocks_stats.append(metadata.to_stats())
                 self._consumed_rows = self._limit
diff --git a/python/ray/data/_internal/execution/operators/output_splitter.py b/python/ray/data/_internal/execution/operators/output_splitter.py
index 3607f6c187bf..bf866dc841a8 100644
--- a/python/ray/data/_internal/execution/operators/output_splitter.py
+++ b/python/ray/data/_internal/execution/operators/output_splitter.py
@@ -210,6 +210,11 @@ def all_inputs_done(self) -> None:
         for i, count in enumerate(allocation):
             bundles = self._split_from_buffer(count)
             for b in bundles:
+                # Splitting may create new blocks; register them for memory tracking.
+                for entry in b.blocks:
+                    self._block_ref_counter.on_block_produced(
+                        entry.ref, entry.metadata.size_bytes or 0, self.id
+                    )
                 b = replace(b, output_split_idx=i)
                 self._output_queue.add(b)
                 self._metrics.on_output_queued(b)
diff --git a/python/ray/data/_internal/execution/operators/zip_operator.py b/python/ray/data/_internal/execution/operators/zip_operator.py
index 2932dcae9e6e..6d6679e682ba 100644
--- a/python/ray/data/_internal/execution/operators/zip_operator.py
+++ b/python/ray/data/_internal/execution/operators/zip_operator.py
@@ -122,8 +122,12 @@ def all_inputs_done(self) -> None:
                 refs = input_buffer.get_next()
                 self._metrics.on_input_dequeued(refs, input_index=idx)
 
-        # Mark outputs as ready
+        # Zipping creates new blocks; track their memory and mark outputs as queued.
         for ref in self._output_buffer:
+            for entry in ref.blocks:
+                self._block_ref_counter.on_block_produced(
+                    entry.ref, entry.metadata.size_bytes or 0, self.id
+                )
             self._metrics.on_output_queued(ref)
 
         super().all_inputs_done()

From fd4281df58d42c0c0a91b35887cf1fba70be379a Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 22 Jun 2026 17:08:21 -0700
Subject: [PATCH 39/53] Track blocks for aggregate num rows

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../data/_internal/execution/operators/aggregate_num_rows.py   | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/ray/data/_internal/execution/operators/aggregate_num_rows.py b/python/ray/data/_internal/execution/operators/aggregate_num_rows.py
index 1e127ec1fb96..92874da55da4 100644
--- a/python/ray/data/_internal/execution/operators/aggregate_num_rows.py
+++ b/python/ray/data/_internal/execution/operators/aggregate_num_rows.py
@@ -53,6 +53,9 @@ def _get_next_inner(self) -> RefBundle:
             [BlockEntry(block_ref, metadata)], owns_blocks=True, schema=schema
         )
 
+        self._block_ref_counter.on_block_produced(
+            block_ref, metadata.size_bytes or 0, self.id
+        )
         self._has_outputted = True
         return bundle
 

From f4d9156294974414a94b32fbb01b7f5c097d0e88 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 22 Jun 2026 18:45:08 -0700
Subject: [PATCH 40/53] Track blocks for shuffle reduce

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../operators/shuffle_operators/shuffle_reduce_operator.py    | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py b/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py
index e97701d86879..726f36a74312 100644
--- a/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py
+++ b/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py
@@ -209,6 +209,10 @@ def _emit_empty_partition(self, refs: RefBundle, schema: pa.Schema) -> None:
         )
         refs.destroy_if_owned()
 
+        for entry in out_bundle.blocks:
+            self._block_ref_counter.on_block_produced(
+                entry.ref, entry.metadata.size_bytes or 0, self.id
+            )
         self._num_reduce_tasks_submitted += 1
         self._output_queue.append(out_bundle)
         self._metrics.on_output_queued(out_bundle)

From ba10c6be9d60b3005ae1f027eedcc8ce23b6585a Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 22 Jun 2026 18:45:57 -0700
Subject: [PATCH 41/53] Simplify shuffle reduce memory tracking logic

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../shuffle_operators/shuffle_reduce_operator.py          | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py b/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py
index 726f36a74312..6981d99201a2 100644
--- a/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py
+++ b/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py
@@ -209,10 +209,10 @@ def _emit_empty_partition(self, refs: RefBundle, schema: pa.Schema) -> None:
         )
         refs.destroy_if_owned()
 
-        for entry in out_bundle.blocks:
-            self._block_ref_counter.on_block_produced(
-                entry.ref, entry.metadata.size_bytes or 0, self.id
-            )
+        # Empty partition creates a new block; register it for memory tracking.
+        self._block_ref_counter.on_block_produced(
+            out_bundle.blocks[0].ref, block_meta.size_bytes or 0, self.id
+        )
         self._num_reduce_tasks_submitted += 1
         self._output_queue.append(out_bundle)
         self._metrics.on_output_queued(out_bundle)

From bd3487d190879e7ed200987a2578e00fc3899b52 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 24 Jun 2026 20:29:28 -0700
Subject: [PATCH 42/53] Pass no-op callback to BlockRefCounter in tests

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 ...st_reservation_based_resource_allocator.py | 32 +++++------
 .../ray/data/tests/test_resource_manager.py   | 54 +++++++++++--------
 .../data/tests/unit/test_resource_manager.py  |  4 +-
 3 files changed, 49 insertions(+), 41 deletions(-)

diff --git a/python/ray/data/tests/test_reservation_based_resource_allocator.py b/python/ray/data/tests/test_reservation_based_resource_allocator.py
index 53309bb94646..962ddc6eaa3b 100644
--- a/python/ray/data/tests/test_reservation_based_resource_allocator.py
+++ b/python/ray/data/tests/test_reservation_based_resource_allocator.py
@@ -73,7 +73,7 @@ def mock_get_global_limits():
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = op_internal_usage
@@ -239,7 +239,7 @@ def test_reserve_min_resource_requirements(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(
             return_value=ExecutionResources.zero()
@@ -301,7 +301,7 @@ def test_reserve_min_resources_for_gpu_ops(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(
             return_value=ExecutionResources.zero()
@@ -337,7 +337,7 @@ def test_does_not_reserve_more_than_max_resource_usage(self):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(
             return_value=ExecutionResources.zero()
@@ -403,7 +403,7 @@ def test_budget_capped_by_max_resource_usage(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = {o1: 0, o2: 40, o3: 40}
@@ -481,7 +481,7 @@ def test_budget_capped_by_max_resource_usage_all_capped(self, restore_data_conte
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = {o1: 0, o2: 40, o3: 40}
@@ -516,7 +516,7 @@ def test_only_handle_eligible_ops(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(
             return_value=ExecutionResources.zero()
@@ -580,7 +580,7 @@ def test_gpu_allocation(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = dict.fromkeys([o1, o2, o3], 0)
@@ -629,7 +629,7 @@ def test_multiple_gpu_operators(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = dict.fromkeys([o1, o2, o3], 0)
@@ -673,7 +673,7 @@ def test_gpu_usage_exceeds_global_limits(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = dict.fromkeys([o1, o2], 0)
@@ -720,7 +720,7 @@ def test_gpu_unbounded_operator_can_autoscale(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = dict.fromkeys([o1, o2], 0)
@@ -792,7 +792,7 @@ def test_actor_pool_gpu_operator_gets_gpu_budget_in_cpu_pipeline(
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = dict.fromkeys([o1, o2, o3, o4, o5], 0)
@@ -849,7 +849,7 @@ def test_gpu_bounded_vs_unbounded_operators(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = dict.fromkeys([o1, o2, o3], 0)
@@ -938,7 +938,7 @@ def test_gpu_not_reserved_for_non_gpu_operators(
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = dict.fromkeys(ops, 0)
@@ -1000,7 +1000,7 @@ def test_reservation_accounts_for_completed_ops(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager._mem_op_internal = op_internal_usage
@@ -1098,7 +1098,7 @@ def mock_get_global_limits():
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
         resource_manager.get_global_limits = MagicMock(
diff --git a/python/ray/data/tests/test_resource_manager.py b/python/ray/data/tests/test_resource_manager.py
index a12facc53cf6..2d2ea8c0726d 100644
--- a/python/ray/data/tests/test_resource_manager.py
+++ b/python/ray/data/tests/test_resource_manager.py
@@ -57,7 +57,10 @@ def mock_map_op(
         compute_strategy=compute_strategy,
         name=name,
     )
-    op.start(ExecutionOptions(), BlockRefCounter())
+    op.start(
+        ExecutionOptions(),
+        BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
+    )
     return op
 
 
@@ -102,7 +105,10 @@ def mock_all_to_all_op(input_op, name="MockShuffle"):
         data_context=DataContext.get_current(),
         name=name,
     )
-    op.start(ExecutionOptions(), BlockRefCounter())
+    op.start(
+        ExecutionOptions(),
+        BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
+    )
     return op
 
 
@@ -120,7 +126,7 @@ def _resource_manager_for_limits_only_test(
         options,
         get_total_resources,
         DataContext.get_current(),
-        BlockRefCounter(),
+        BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
     )
 
 
@@ -285,7 +291,7 @@ def test_update_usage(self):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager._op_resource_allocator = None
         resource_manager.update_usages()
@@ -348,7 +354,7 @@ def test_object_store_usage(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(return_value=ExecutionResources.zero()),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         ray.data.DataContext.get_current()._max_num_blocks_in_streaming_gen_buffer = 1
         ray.data.DataContext.get_current().target_max_block_size = 2
@@ -458,7 +464,7 @@ def test_object_store_accounting_delegates_to_op(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(return_value=ExecutionResources.zero()),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
 
         resource_manager.update_usages()
@@ -495,7 +501,7 @@ def test_get_completed_ops_usage(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
 
@@ -558,7 +564,7 @@ def test_get_completed_ops_usage_complex_graph(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
 
@@ -589,7 +595,7 @@ def test_external_consumer_bytes_attributed_to_terminal_operator(
             ExecutionOptions(),
             lambda: cluster_resources,
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
 
         for op in [o1, o2, o3]:
@@ -658,7 +664,7 @@ def test_external_consumer_bytes_input_data_buffer_sink(self, restore_data_conte
             ExecutionOptions(),
             lambda: ExecutionResources(cpu=10, gpu=0, object_store_memory=1000),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         buf.current_logical_usage = MagicMock(return_value=ExecutionResources.zero())
         buf.running_logical_usage = MagicMock(return_value=ExecutionResources.zero())
@@ -690,7 +696,7 @@ def test_external_consumer_bytes_surfaced_in_op_usage_str(
             ExecutionOptions(),
             lambda: cluster_resources,
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
 
         for op in [o1, o2, o3]:
@@ -733,7 +739,7 @@ def test_topology_rejects_multiple_terminal_operators(self, restore_data_context
                 ExecutionOptions(),
                 MagicMock(return_value=ExecutionResources.zero()),
                 DataContext.get_current(),
-                BlockRefCounter(),
+                BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
             )
 
     def test_topology_rejects_empty_topology(self, restore_data_context):
@@ -743,7 +749,7 @@ def test_topology_rejects_empty_topology(self, restore_data_context):
                 ExecutionOptions(),
                 MagicMock(return_value=ExecutionResources.zero()),
                 DataContext.get_current(),
-                BlockRefCounter(),
+                BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
             )
 
     def test_topology_rejects_no_terminal_operator(self, restore_data_context):
@@ -761,7 +767,7 @@ def test_topology_rejects_no_terminal_operator(self, restore_data_context):
                 ExecutionOptions(),
                 MagicMock(return_value=ExecutionResources.zero()),
                 DataContext.get_current(),
-                BlockRefCounter(),
+                BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
             )
 
     def test_is_blocking_materializing_op(self, restore_data_context):
@@ -790,7 +796,7 @@ def test_is_blocking_materializing_op(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
 
         # Case 1: Shuffle operator itself is blocking materializing
@@ -821,7 +827,7 @@ def test_is_blocking_materializing_op(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
 
         # o5's downstream (o6, o7) has no blocking materializing ops
@@ -849,7 +855,9 @@ def test_memory_limit_blocks_task_submission(self, restore_data_context):
             options=options,
             get_total_resources=lambda: cluster_resources,
             data_context=DataContext.get_current(),
-            block_ref_counter=BlockRefCounter(),
+            block_ref_counter=BlockRefCounter(
+                add_object_out_of_scope_callback=lambda *_: True
+            ),
         )
         resource_manager.update_usages()
 
@@ -882,7 +890,7 @@ def test_unblock_backpressure_terminal_operator(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         guard = OutputBackpressureGuard(topo, resource_manager)
 
@@ -900,7 +908,7 @@ def test_unblock_backpressure_terminal_operator(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         guard = OutputBackpressureGuard(topo, resource_manager)
 
@@ -928,7 +936,7 @@ def test_no_unblock_backpressure_terminal_with_external_consumer(
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         guard = OutputBackpressureGuard(topo, resource_manager)
 
@@ -963,7 +971,7 @@ def test_unblock_backpressure_downstream_idle(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         guard = OutputBackpressureGuard(topo, resource_manager)
         o3.num_active_tasks = MagicMock(return_value=0)
@@ -994,7 +1002,7 @@ def test_unblock_backpressure_fallback_to_idle_detector(self, restore_data_conte
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         guard = OutputBackpressureGuard(topo, resource_manager)
 
@@ -1036,7 +1044,7 @@ def test_unblock_when_resource_allocator_disabled(self, restore_data_context):
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(),
+            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         assert not resource_manager.op_resource_allocator_enabled()
 
diff --git a/python/ray/data/tests/unit/test_resource_manager.py b/python/ray/data/tests/unit/test_resource_manager.py
index cfeb83848b77..6a349071e6c9 100644
--- a/python/ray/data/tests/unit/test_resource_manager.py
+++ b/python/ray/data/tests/unit/test_resource_manager.py
@@ -134,7 +134,7 @@ def test_does_not_double_count_usage_from_union():
         ExecutionOptions(),
         lambda: total_resources,
         DataContext.get_current(),
-        BlockRefCounter(),
+        BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
     )
 
     # Create two 1-byte `RefBundle`s.
@@ -197,7 +197,7 @@ def test_per_input_inqueue_attribution_for_union():
         options,
         lambda: total_resources,
         DataContext.get_current(),
-        BlockRefCounter(),
+        BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
     )
 
     # Create two 10-byte RefBundles with distinct block refs (simulates real execution

From a421eaee557e435d2e15d670438cea02deb390b0 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Thu, 25 Jun 2026 12:28:36 -0700
Subject: [PATCH 43/53] Add block_ref_counter argument to
 build_streaming_topology

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../execution/interfaces/physical_operator.py |  1 -
 .../_internal/execution/streaming_executor.py |  9 ++--
 .../execution/streaming_executor_state.py     |  8 +++-
 python/ray/data/tests/conftest.py             |  6 +++
 .../tests/test_actor_pool_map_operator.py     |  5 +-
 ...st_reservation_based_resource_allocator.py | 33 ++++++-------
 .../ray/data/tests/test_resource_manager.py   | 39 +++++++--------
 .../ray/data/tests/test_streaming_executor.py | 47 ++++++++++++-------
 .../data/tests/unit/test_resource_manager.py  |  5 +-
 9 files changed, 89 insertions(+), 64 deletions(-)

diff --git a/python/ray/data/_internal/execution/interfaces/physical_operator.py b/python/ray/data/_internal/execution/interfaces/physical_operator.py
index 8a76bca21b23..17373f07e269 100644
--- a/python/ray/data/_internal/execution/interfaces/physical_operator.py
+++ b/python/ray/data/_internal/execution/interfaces/physical_operator.py
@@ -455,7 +455,6 @@ def __init__(
         self._id = str(uuid.uuid4())
         # Initialize metrics after data_context is set
         self._metrics = OpRuntimeMetrics(self)
-        self._block_ref_counter: Optional[BlockRefCounter] = None
 
     def __reduce__(self):
         raise ValueError("Operator is not serializable.")
diff --git a/python/ray/data/_internal/execution/streaming_executor.py b/python/ray/data/_internal/execution/streaming_executor.py
index bfc081c8546a..1bf651f9035c 100644
--- a/python/ray/data/_internal/execution/streaming_executor.py
+++ b/python/ray/data/_internal/execution/streaming_executor.py
@@ -220,9 +220,11 @@ def execute(
                 )
 
         # Setup the streaming DAG topology and start the runner thread.
-        self._topology = build_streaming_topology(dag, self._options)
-
         self._block_ref_counter = BlockRefCounter()
+        self._topology = build_streaming_topology(
+            dag, self._options, self._block_ref_counter
+        )
+
         self._resource_manager = ResourceManager(
             self._topology,
             self._options,
@@ -231,9 +233,6 @@ def execute(
             self._block_ref_counter,
         )
 
-        for op in self._topology:
-            op.start(self._options, self._block_ref_counter)
-
         # Constructed once per executor (not per scheduling iteration) so the
         # guard's idle-detection state accumulates across scheduling iterations.
         self._output_backpressure_guard = OutputBackpressureGuard(
diff --git a/python/ray/data/_internal/execution/streaming_executor_state.py b/python/ray/data/_internal/execution/streaming_executor_state.py
index 1c691f13bf23..25446a3fe9b8 100644
--- a/python/ray/data/_internal/execution/streaming_executor_state.py
+++ b/python/ray/data/_internal/execution/streaming_executor_state.py
@@ -14,6 +14,7 @@
 import ray
 from ray.data._internal.actor_autoscaler.autoscaling_actor_pool import ActorPoolInfo
 from ray.data._internal.execution.backpressure_policy import BackpressurePolicy
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.bundle_queue import (
     ThreadSafeBundleQueue,
     create_bundle_queue,
@@ -543,7 +544,9 @@ def mark_finished(self, exception: Optional[Exception] = None):
 
 
 def build_streaming_topology(
-    dag: PhysicalOperator, options: ExecutionOptions
+    dag: PhysicalOperator,
+    options: ExecutionOptions,
+    block_ref_counter: BlockRefCounter,
 ) -> Topology:
     """Instantiate the streaming operator state topology for the given DAG.
 
@@ -554,6 +557,8 @@ def build_streaming_topology(
     Args:
         dag: The operator DAG to instantiate.
         options: The execution options to use to start operators.
+        block_ref_counter: The executor-wide shared counter for tracking
+            object-store memory.
 
     Returns:
         The topology dict holding the streaming execution state.
@@ -575,6 +580,7 @@ def setup_state(op: PhysicalOperator) -> OpState:
         # Create state.
         op_state = OpState(op, inqueues)
         topology[op] = op_state
+        op.start(options, block_ref_counter)
         return op_state
 
     setup_state(dag)
diff --git a/python/ray/data/tests/conftest.py b/python/ray/data/tests/conftest.py
index 6df08575a778..b71e26a17883 100644
--- a/python/ray/data/tests/conftest.py
+++ b/python/ray/data/tests/conftest.py
@@ -13,6 +13,7 @@
 import ray
 from ray._common.test_utils import wait_for_condition
 from ray._private.internal_api import get_memory_info_reply, get_state_from_address
+from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.operators.base_physical_operator import (
     AllToAllOperator,
 )
@@ -48,6 +49,11 @@ def mock_all_to_all_op(input_op, name="MockAllToAll"):
     return op
 
 
+def noop_counter():
+    """BlockRefCounter that works without a Ray cluster."""
+    return BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True)
+
+
 @pytest.fixture(scope="module")
 def data_context_override(request):
     overrides = getattr(request, "param", {})
diff --git a/python/ray/data/tests/test_actor_pool_map_operator.py b/python/ray/data/tests/test_actor_pool_map_operator.py
index 94214fd614ef..75c48b00ab96 100644
--- a/python/ray/data/tests/test_actor_pool_map_operator.py
+++ b/python/ray/data/tests/test_actor_pool_map_operator.py
@@ -57,6 +57,7 @@
     DEFAULT_ACTOR_MAX_TASKS_IN_FLIGHT_TO_MAX_CONCURRENCY_FACTOR,
     DataContext,
 )
+from ray.data.tests.conftest import noop_counter
 from ray.data.tests.test_executor_resource_management import SMALL_STR
 from ray.data.tests.test_operators import _mul2_map_data_prcessor
 from ray.data.tests.util import (
@@ -1285,7 +1286,9 @@ def test_completed_when_downstream_op_has_finished_execution(ray_start_regular_s
     downstream_op = IdentityOperator(
         "Downstream", input_dependencies=[actor_pool_map_op], data_context=data_context
     )
-    topology = build_streaming_topology(downstream_op, ExecutionOptions())
+    topology = build_streaming_topology(
+        downstream_op, ExecutionOptions(), noop_counter()
+    )
 
     # SETUP: Add a bundle to the upstream operator's external output queue. This is
     # necessary to reproduce the bug where the actor pool operator wouldn't complete if
diff --git a/python/ray/data/tests/test_reservation_based_resource_allocator.py b/python/ray/data/tests/test_reservation_based_resource_allocator.py
index 962ddc6eaa3b..f68a341e1ab1 100644
--- a/python/ray/data/tests/test_reservation_based_resource_allocator.py
+++ b/python/ray/data/tests/test_reservation_based_resource_allocator.py
@@ -21,6 +21,7 @@
 from ray.data._internal.execution.util import make_ref_bundles
 from ray.data.context import DataContext
 from ray.data.tests.conftest import *  # noqa
+from ray.data.tests.conftest import noop_counter
 from ray.data.tests.test_resource_manager import (
     mock_join_op,
     mock_map_op,
@@ -60,7 +61,7 @@ def test_basic(self, restore_data_context):
         op_internal_usage = dict.fromkeys([o1, o2, o3, o4], 0)
         op_outputs_usages = dict.fromkeys([o1, o2, o3, o4], 0)
 
-        topo = build_streaming_topology(o4, ExecutionOptions())
+        topo = build_streaming_topology(o4, ExecutionOptions(), noop_counter())
 
         global_limits = ExecutionResources.zero()
 
@@ -232,7 +233,7 @@ def test_reserve_min_resource_requirements(self, restore_data_context):
                 )
             )
 
-        topo = build_streaming_topology(o5, ExecutionOptions())
+        topo = build_streaming_topology(o5, ExecutionOptions(), noop_counter())
 
         resource_manager = ResourceManager(
             topo,
@@ -294,7 +295,7 @@ def test_reserve_min_resources_for_gpu_ops(self, restore_data_context):
             )
         )
 
-        topo = build_streaming_topology(o2, ExecutionOptions())
+        topo = build_streaming_topology(o2, ExecutionOptions(), noop_counter())
 
         resource_manager = ResourceManager(
             topo,
@@ -331,7 +332,7 @@ def test_does_not_reserve_more_than_max_resource_usage(self):
                 ExecutionResources(cpu=1, object_store_memory=1),
             )
         )
-        topo = build_streaming_topology(o2, ExecutionOptions())
+        topo = build_streaming_topology(o2, ExecutionOptions(), noop_counter())
         resource_manager = ResourceManager(
             topo,
             ExecutionOptions(),
@@ -388,7 +389,7 @@ def test_budget_capped_by_max_resource_usage(self, restore_data_context):
             )
         )
 
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
 
         global_limits = ExecutionResources(cpu=20, object_store_memory=400)
 
@@ -466,7 +467,7 @@ def test_budget_capped_by_max_resource_usage_all_capped(self, restore_data_conte
             )
         )
 
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
 
         global_limits = ExecutionResources(cpu=20, object_store_memory=400)
 
@@ -509,7 +510,7 @@ def test_only_handle_eligible_ops(self, restore_data_context):
         o1 = InputDataBuffer(DataContext.get_current(), input)
         o2 = mock_map_op(o1)
         o3 = LimitOperator(1, o2, DataContext.get_current())
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
 
         resource_manager = ResourceManager(
             topo,
@@ -566,7 +567,7 @@ def test_gpu_allocation(self, restore_data_context):
             return_value=(ExecutionResources(0, 1, 0), ExecutionResources.inf())
         )
 
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
 
         global_limits = ExecutionResources(gpu=4)
         op_usages = {
@@ -615,7 +616,7 @@ def test_multiple_gpu_operators(self, restore_data_context):
             return_value=(ExecutionResources(0, 1, 0), ExecutionResources(0, 1, 0))
         )
 
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
 
         global_limits = ExecutionResources(gpu=4)
         op_usages = {
@@ -657,7 +658,7 @@ def test_gpu_usage_exceeds_global_limits(self, restore_data_context):
             return_value=(ExecutionResources(0, 1, 0), ExecutionResources(0, 2, 0))
         )
 
-        topo = build_streaming_topology(o2, ExecutionOptions())
+        topo = build_streaming_topology(o2, ExecutionOptions(), noop_counter())
 
         global_limits = ExecutionResources(gpu=1)
         op_usages = {
@@ -707,7 +708,7 @@ def test_gpu_unbounded_operator_can_autoscale(self, restore_data_context):
             return_value=(ExecutionResources(0, 1, 0), ExecutionResources.inf())
         )
 
-        topo = build_streaming_topology(o2, ExecutionOptions())
+        topo = build_streaming_topology(o2, ExecutionOptions(), noop_counter())
 
         global_limits = ExecutionResources(gpu=8)
         op_usages = {
@@ -771,7 +772,7 @@ def test_actor_pool_gpu_operator_gets_gpu_budget_in_cpu_pipeline(
         )
         o5 = mock_map_op(o4, ray_remote_args={"num_cpus": 1}, name="Write")
 
-        topo = build_streaming_topology(o5, ExecutionOptions())
+        topo = build_streaming_topology(o5, ExecutionOptions(), noop_counter())
 
         # Cluster with 2 GPUs available
         global_limits = ExecutionResources(
@@ -835,7 +836,7 @@ def test_gpu_bounded_vs_unbounded_operators(self, restore_data_context):
             return_value=(ExecutionResources(0, 1, 0), ExecutionResources.inf())
         )
 
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
 
         global_limits = ExecutionResources(gpu=8)
         op_usages = {
@@ -927,7 +928,7 @@ def test_gpu_not_reserved_for_non_gpu_operators(
             )
         )
 
-        topo = build_streaming_topology(write_op, ExecutionOptions())
+        topo = build_streaming_topology(write_op, ExecutionOptions(), noop_counter())
 
         global_limits = ExecutionResources(cpu=8, gpu=8, object_store_memory=10_000_000)
         ops = [o1, read_op, infer1_op, infer2_op, write_op]
@@ -991,7 +992,7 @@ def test_reservation_accounts_for_completed_ops(self, restore_data_context):
         op_internal_usage = dict.fromkeys([o1, o2, o3, o4], 0)
         op_outputs_usages = dict.fromkeys([o1, o2, o3, o4], 0)
 
-        topo = build_streaming_topology(o4, ExecutionOptions())
+        topo = build_streaming_topology(o4, ExecutionOptions(), noop_counter())
 
         global_limits = ExecutionResources(cpu=10, object_store_memory=250)
 
@@ -1085,7 +1086,7 @@ def test_reservation_accounts_for_completed_ops_complex_graph(
         op_internal_usage = dict.fromkeys([o1, o2, o3, o4, o5, o6, o7, o8], 0)
         op_outputs_usages = dict.fromkeys([o1, o2, o3, o4, o5, o6, o7, o8], 0)
 
-        topo = build_streaming_topology(o8, ExecutionOptions())
+        topo = build_streaming_topology(o8, ExecutionOptions(), noop_counter())
 
         global_limits = ExecutionResources.zero()
 
diff --git a/python/ray/data/tests/test_resource_manager.py b/python/ray/data/tests/test_resource_manager.py
index 2d2ea8c0726d..17bfa09dbb9c 100644
--- a/python/ray/data/tests/test_resource_manager.py
+++ b/python/ray/data/tests/test_resource_manager.py
@@ -41,6 +41,7 @@
 from ray.data.block import TaskExecWorkerStats
 from ray.data.context import DataContext
 from ray.data.tests.conftest import *  # noqa
+from ray.data.tests.conftest import noop_counter
 
 
 def mock_map_op(
@@ -57,10 +58,6 @@ def mock_map_op(
         compute_strategy=compute_strategy,
         name=name,
     )
-    op.start(
-        ExecutionOptions(),
-        BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
-    )
     return op
 
 
@@ -228,7 +225,7 @@ def test_update_usage(self):
         o1 = InputDataBuffer(DataContext.get_current(), [])
         o2 = mock_map_op(o1)
         o3 = mock_map_op(o2)
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
 
         # Mock different metrics that contribute to the resource usage.
         mock_cpu = {
@@ -348,7 +345,7 @@ def test_object_store_usage(self, restore_data_context):
         o2 = mock_map_op(o1)
         o3 = mock_map_op(o2)
 
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
         resource_manager = ResourceManager(
             topo,
             ExecutionOptions(),
@@ -458,7 +455,7 @@ def test_object_store_accounting_delegates_to_op(self, restore_data_context):
             internal=42, outputs=100
         )
 
-        topo = build_streaming_topology(override, ExecutionOptions())
+        topo = build_streaming_topology(override, ExecutionOptions(), noop_counter())
         resource_manager = ResourceManager(
             topo,
             ExecutionOptions(),
@@ -486,7 +483,7 @@ def test_get_completed_ops_usage(self, restore_data_context):
         o1.mark_execution_finished()
         o2.mark_execution_finished()
 
-        topo = build_streaming_topology(o5, ExecutionOptions())
+        topo = build_streaming_topology(o5, ExecutionOptions(), noop_counter())
 
         op_usages = {
             o1: ExecutionResources.zero(),
@@ -546,7 +543,7 @@ def test_get_completed_ops_usage_complex_graph(self, restore_data_context):
         o5.mark_execution_finished()
         o7.mark_execution_finished()
 
-        topo = build_streaming_topology(o8, ExecutionOptions())
+        topo = build_streaming_topology(o8, ExecutionOptions(), noop_counter())
 
         op_usages = {
             o1: ExecutionResources.zero(),
@@ -589,7 +586,7 @@ def test_external_consumer_bytes_attributed_to_terminal_operator(
         o1.mark_execution_finished()
         o2.mark_execution_finished()
 
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
         resource_manager = ResourceManager(
             topo,
             ExecutionOptions(),
@@ -658,7 +655,7 @@ def test_external_consumer_bytes_input_data_buffer_sink(self, restore_data_conte
         attach to that terminal sink instead of being dropped by the
         InputDataBuffer early return."""
         buf = InputDataBuffer(DataContext.get_current(), [])
-        topo = build_streaming_topology(buf, ExecutionOptions())
+        topo = build_streaming_topology(buf, ExecutionOptions(), noop_counter())
         resource_manager = ResourceManager(
             topo,
             ExecutionOptions(),
@@ -690,7 +687,7 @@ def test_external_consumer_bytes_surfaced_in_op_usage_str(
         o2 = mock_map_op(o1)
         o3 = mock_map_op(o2)
 
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
         resource_manager = ResourceManager(
             topo,
             ExecutionOptions(),
@@ -789,7 +786,7 @@ def test_is_blocking_materializing_op(self, restore_data_context):
         o4 = mock_all_to_all_op(o3, name="Sort")
         o5 = mock_map_op(o4, name="Map2")
 
-        topo = build_streaming_topology(o5, ExecutionOptions())
+        topo = build_streaming_topology(o5, ExecutionOptions(), noop_counter())
 
         resource_manager = ResourceManager(
             topo,
@@ -821,7 +818,7 @@ def test_is_blocking_materializing_op(self, restore_data_context):
         o6 = LimitOperator(1, o5, DataContext.get_current())
         o7 = mock_map_op(o6, name="Map3")
 
-        topo2 = build_streaming_topology(o7, ExecutionOptions())
+        topo2 = build_streaming_topology(o7, ExecutionOptions(), noop_counter())
         resource_manager2 = ResourceManager(
             topo2,
             ExecutionOptions(),
@@ -847,7 +844,7 @@ def test_memory_limit_blocks_task_submission(self, restore_data_context):
             name="HighMemoryTask",
         )
 
-        topo = build_streaming_topology(o2, ExecutionOptions())
+        topo = build_streaming_topology(o2, ExecutionOptions(), noop_counter())
         options = ExecutionOptions()
 
         resource_manager = ResourceManager(
@@ -883,7 +880,7 @@ def test_unblock_backpressure_terminal_operator(self, restore_data_context):
         o2 = mock_map_op(o1)
         o3 = LimitOperator(1, o2, DataContext.get_current())
 
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
 
         resource_manager = ResourceManager(
             topo,
@@ -901,7 +898,7 @@ def test_unblock_backpressure_terminal_operator(self, restore_data_context):
         # Add o4 operator - o2 is no longer terminal
         o4 = mock_map_op(o3)
 
-        topo = build_streaming_topology(o4, ExecutionOptions())
+        topo = build_streaming_topology(o4, ExecutionOptions(), noop_counter())
 
         resource_manager = ResourceManager(
             topo,
@@ -929,7 +926,7 @@ def test_no_unblock_backpressure_terminal_with_external_consumer(
         o2 = mock_map_op(o1)
         o3 = LimitOperator(1, o2, DataContext.get_current())
 
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
 
         resource_manager = ResourceManager(
             topo,
@@ -964,7 +961,7 @@ def test_unblock_backpressure_downstream_idle(self, restore_data_context):
         o2 = mock_map_op(o1)
         o3 = mock_map_op(o2)
 
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
 
         resource_manager = ResourceManager(
             topo,
@@ -995,7 +992,7 @@ def test_unblock_backpressure_fallback_to_idle_detector(self, restore_data_conte
         o2 = mock_map_op(o1)
         o3 = mock_map_op(o2)
 
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
 
         resource_manager = ResourceManager(
             topo,
@@ -1037,7 +1034,7 @@ def test_unblock_when_resource_allocator_disabled(self, restore_data_context):
         o2 = mock_map_op(o1)
         o3 = mock_map_op(o2)
 
-        topo = build_streaming_topology(o3, ExecutionOptions())
+        topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
 
         resource_manager = ResourceManager(
             topo,
diff --git a/python/ray/data/tests/test_streaming_executor.py b/python/ray/data/tests/test_streaming_executor.py
index 8e0578c95caf..199a4f74d42a 100644
--- a/python/ray/data/tests/test_streaming_executor.py
+++ b/python/ray/data/tests/test_streaming_executor.py
@@ -70,6 +70,7 @@
 from ray.data.block import BlockAccessor, BlockMetadataWithSchema, TaskExecWorkerStats
 from ray.data.context import EXECUTION_CALLBACKS_ENV_VAR, DataContext
 from ray.data.tests.conftest import *  # noqa
+from ray.data.tests.conftest import noop_counter
 
 
 def mock_resource_manager(
@@ -125,7 +126,7 @@ def test_build_streaming_topology(verbose_progress, ray_start_regular_shared):
         DataContext.get_current(),
     )
     topo = build_streaming_topology(
-        o3, ExecutionOptions(verbose_progress=verbose_progress)
+        o3, ExecutionOptions(verbose_progress=verbose_progress), noop_counter()
     )
     assert len(topo) == 3, topo
     assert o1 in topo, topo
@@ -155,7 +156,9 @@ def test_disallow_non_unique_operators(ray_start_regular_shared):
         DataContext.get_current(),
     )
     with pytest.raises(ValueError):
-        build_streaming_topology(o4, ExecutionOptions(verbose_progress=True))
+        build_streaming_topology(
+            o4, ExecutionOptions(verbose_progress=True), noop_counter()
+        )
 
 
 def _make_disabled_guard() -> MagicMock:
@@ -180,7 +183,9 @@ def test_process_completed_tasks(sleep_task_ref, ray_start_regular_shared):
         o1,
         DataContext.get_current(),
     )
-    topo = build_streaming_topology(o2, ExecutionOptions(verbose_progress=True))
+    topo = build_streaming_topology(
+        o2, ExecutionOptions(verbose_progress=True), noop_counter()
+    )
 
     # Test processing output bundles.
     assert len(topo[o1].output_queue) == 0, topo
@@ -229,7 +234,9 @@ def test_process_completed_tasks(sleep_task_ref, ray_start_regular_shared):
         o2,
         DataContext.get_current(),
     )
-    topo = build_streaming_topology(o3, ExecutionOptions(verbose_progress=True))
+    topo = build_streaming_topology(
+        o3, ExecutionOptions(verbose_progress=True), noop_counter()
+    )
 
     o3.mark_execution_finished()
     o2.mark_execution_finished = MagicMock()
@@ -254,7 +261,9 @@ def test_update_operator_states_drains_upstream(ray_start_regular_shared):
         o2,
         DataContext.get_current(),
     )
-    topo = build_streaming_topology(o3, ExecutionOptions(verbose_progress=True))
+    topo = build_streaming_topology(
+        o3, ExecutionOptions(verbose_progress=True), noop_counter()
+    )
 
     # First, populate the upstream output queues by processing some tasks
     process_completed_tasks(topo, [], 0, _make_disabled_guard())
@@ -299,7 +308,7 @@ def test_get_eligible_operators_to_run(ray_start_regular_shared):
         DataContext.get_current(),
         name="O3",
     )
-    topo = build_streaming_topology(o3, opts)
+    topo = build_streaming_topology(o3, opts, noop_counter())
 
     resource_manager = mock_resource_manager(
         global_limits=ExecutionResources.for_limits(1, 1, 1),
@@ -374,7 +383,7 @@ def test_backpressure_policy_tracking(ray_start_regular_shared):
         DataContext.get_current(),
         name="O2",
     )
-    topo = build_streaming_topology(o2, opts)
+    topo = build_streaming_topology(o2, opts, noop_counter())
 
     # Add input to o2's input queue so it becomes eligible
     topo[o1].output_queue.append(make_ref_bundle("dummy1"))
@@ -452,7 +461,7 @@ def test_output_backpressure_policy_tracking(ray_start_regular_shared):
         DataContext.get_current(),
         name="O2",
     )
-    topo = build_streaming_topology(o2, opts)
+    topo = build_streaming_topology(o2, opts, noop_counter())
 
     # Create mock backpressure policies for output limiting with name property
     class LimitingPolicy:
@@ -527,7 +536,7 @@ def test_process_completed_tasks_unblocks_when_non_resource_budget_policy_zeros_
         DataContext.get_current(),
         name="O2",
     )
-    topo = build_streaming_topology(o2, ExecutionOptions())
+    topo = build_streaming_topology(o2, ExecutionOptions(), noop_counter())
 
     resource_manager = ResourceManager(
         topo,
@@ -576,7 +585,7 @@ def test_summary_str_backpressure_policies(ray_start_regular_shared):
         DataContext.get_current(),
         name="O2",
     )
-    topo = build_streaming_topology(o2, opts)
+    topo = build_streaming_topology(o2, opts, noop_counter())
 
     resource_manager = mock_resource_manager()
 
@@ -681,7 +690,7 @@ def _get_op_usage_mocked(op):
         # Case 1: Should pick the `o4` since it has throttling disabled
         _mock.return_value = [o1, o2, o3, o4]
 
-        topo = build_streaming_topology(o4, opts)
+        topo = build_streaming_topology(o4, opts, noop_counter())
 
         selected = select_operator_to_run(
             topo,
@@ -696,7 +705,7 @@ def _get_op_usage_mocked(op):
         # Case 2: Should pick the `o1` since it has lowest object store usage
         _mock.return_value = [o1, o2, o3]
 
-        topo = build_streaming_topology(o3, opts)
+        topo = build_streaming_topology(o3, opts, noop_counter())
 
         selected = select_operator_to_run(
             topo,
@@ -749,7 +758,7 @@ def test_debug_dump_topology(ray_start_regular_shared):
         o2,
         DataContext.get_current(),
     )
-    topo = build_streaming_topology(o3, opt)
+    topo = build_streaming_topology(o3, opt, noop_counter())
     resource_manager = ResourceManager(
         topo,
         ExecutionOptions(),
@@ -869,7 +878,7 @@ def test_num_waiting_consumers_tracking(self):
         """num_waiting_consumers is incremented/decremented by get_output_blocking."""
         o1 = InputDataBuffer(ray.data.DataContext.get_current(), [])
         o2 = LimitOperator(1, o1, ray.data.DataContext.get_current())
-        topo = build_streaming_topology(o2, ExecutionOptions())
+        topo = build_streaming_topology(o2, ExecutionOptions(), noop_counter())
         state = topo[o2]
 
         assert state.num_waiting_consumers == 0
@@ -903,7 +912,7 @@ def test_num_waiting_consumers_concurrent(self):
         For example, this happens for multiple streaming_split iterators."""
         o1 = InputDataBuffer(ray.data.DataContext.get_current(), [])
         o2 = LimitOperator(1, o1, ray.data.DataContext.get_current())
-        topo = build_streaming_topology(o2, ExecutionOptions())
+        topo = build_streaming_topology(o2, ExecutionOptions(), noop_counter())
         state = topo[o2]
 
         def blocking_consumer():
@@ -1230,7 +1239,9 @@ def test_create_topology_metadata():
     executor = StreamingExecutor(DataContext.get_current())
 
     # Initialize the topology on the executor
-    executor._topology = build_streaming_topology(o3, ExecutionOptions())
+    executor._topology = build_streaming_topology(
+        o3, ExecutionOptions(), noop_counter()
+    )
 
     # Call the _dump_dag_structure method
     op_to_id = {
@@ -1293,7 +1304,9 @@ def test_create_topology_metadata_with_sub_stages():
 
     # Create the executor and set up topology
     executor = StreamingExecutor(DataContext.get_current())
-    executor._topology = build_streaming_topology(o2, ExecutionOptions())
+    executor._topology = build_streaming_topology(
+        o2, ExecutionOptions(), noop_counter()
+    )
 
     # Get the DAG structure
     op_to_id = {
diff --git a/python/ray/data/tests/unit/test_resource_manager.py b/python/ray/data/tests/unit/test_resource_manager.py
index 6a349071e6c9..463a6eac99a2 100644
--- a/python/ray/data/tests/unit/test_resource_manager.py
+++ b/python/ray/data/tests/unit/test_resource_manager.py
@@ -21,6 +21,7 @@
 from ray.data.block import BlockMetadata
 from ray.data.context import DataContext
 from ray.data.tests.conftest import *  # noqa
+from ray.data.tests.conftest import noop_counter
 
 
 def test_physical_operator_tracks_output_dependencies():
@@ -125,7 +126,7 @@ def test_does_not_double_count_usage_from_union():
     input1 = PhysicalOperator("op1", [], DataContext.get_current())
     input2 = PhysicalOperator("op2", [], DataContext.get_current())
     union_op = UnionOperator(DataContext.get_current(), input1, input2)
-    topology = build_streaming_topology(union_op, ExecutionOptions())
+    topology = build_streaming_topology(union_op, ExecutionOptions(), noop_counter())
 
     # Create a resource manager.
     total_resources = ExecutionResources(cpu=0, object_store_memory=2)
@@ -188,7 +189,7 @@ def test_per_input_inqueue_attribution_for_union():
 
     options = ExecutionOptions()
     options.preserve_order = True
-    topology = build_streaming_topology(union_op, options)
+    topology = build_streaming_topology(union_op, options, noop_counter())
 
     # Create a resource manager.
     total_resources = ExecutionResources(cpu=0, object_store_memory=200)

From ebe79cc94e4edf0754c8504b58416fda5d6190de Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Thu, 25 Jun 2026 13:03:12 -0700
Subject: [PATCH 44/53] Remove duplicate start call in mock_all_to_all_op

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/data/tests/test_resource_manager.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/python/ray/data/tests/test_resource_manager.py b/python/ray/data/tests/test_resource_manager.py
index 17bfa09dbb9c..e967514987e0 100644
--- a/python/ray/data/tests/test_resource_manager.py
+++ b/python/ray/data/tests/test_resource_manager.py
@@ -102,10 +102,6 @@ def mock_all_to_all_op(input_op, name="MockShuffle"):
         data_context=DataContext.get_current(),
         name=name,
     )
-    op.start(
-        ExecutionOptions(),
-        BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
-    )
     return op
 
 

From d9dfee845542f6aff614f70738a7026511f50409 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Fri, 26 Jun 2026 15:12:02 -0700
Subject: [PATCH 45/53] Suppress pyrefly bad-argument-type error in shuffle_reduce_operator

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../operators/shuffle_operators/shuffle_reduce_operator.py    | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py b/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py
index 6981d99201a2..87b3db94063c 100644
--- a/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py
+++ b/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_reduce_operator.py
@@ -211,7 +211,9 @@ def _emit_empty_partition(self, refs: RefBundle, schema: pa.Schema) -> None:
 
         # Empty partition creates a new block; register it for memory tracking.
         self._block_ref_counter.on_block_produced(
-            out_bundle.blocks[0].ref, block_meta.size_bytes or 0, self.id
+            out_bundle.blocks[0].ref,  # pyrefly: ignore[bad-argument-type]
+            block_meta.size_bytes or 0,
+            self.id,
         )
         self._num_reduce_tasks_submitted += 1
         self._output_queue.append(out_bundle)

From 83225e19e8e84176b99d9d0a19c53f6b04fb7d4b Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Fri, 26 Jun 2026 16:03:02 -0700
Subject: [PATCH 46/53] Address missed start calls

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/data/tests/test_gpu_shuffle.py    | 3 +++
 python/ray/data/tests/test_limit_operator.py | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/python/ray/data/tests/test_gpu_shuffle.py b/python/ray/data/tests/test_gpu_shuffle.py
index 61cf0cc1f4a7..d02de008755d 100644
--- a/python/ray/data/tests/test_gpu_shuffle.py
+++ b/python/ray/data/tests/test_gpu_shuffle.py
@@ -15,6 +15,7 @@
 import ray.data._internal.gpu_shuffle.hash_shuffle as hash_shuffle
 from ray.data._internal.execution.interfaces import (
     BlockEntry,
+    ExecutionOptions,
     ExecutionResources,
     PhysicalOperator,
     RefBundle,
@@ -31,6 +32,7 @@
 from ray.data._internal.util import explain_plan
 from ray.data.block import BlockMetadata
 from ray.data.context import DataContext, ShuffleStrategy
+from ray.data.tests.conftest import noop_counter
 
 # ---------------------------------------------------------------------------
 # Helpers
@@ -433,6 +435,7 @@ def _make_op(self, nranks=2, num_partitions=4):
             )
         op._rank_pool._actors = mock_actors
         op._rank_pool._nranks = nranks
+        op.start(ExecutionOptions(), noop_counter())
         return op, mock_actors
 
     def test_finalization_not_started_until_inputs_complete(self):
diff --git a/python/ray/data/tests/test_limit_operator.py b/python/ray/data/tests/test_limit_operator.py
index 022c9252376e..1cec3e2d2725 100644
--- a/python/ray/data/tests/test_limit_operator.py
+++ b/python/ray/data/tests/test_limit_operator.py
@@ -5,12 +5,14 @@
 import pytest
 
 import ray
+from ray.data._internal.execution.interfaces import ExecutionOptions
 from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer
 from ray.data._internal.execution.operators.limit_operator import LimitOperator
 from ray.data._internal.execution.streaming_executor import StreamingExecutor
 from ray.data._internal.execution.util import make_ref_bundles
 from ray.data._internal.logical.optimizers import get_execution_plan
 from ray.data.context import DataContext
+from ray.data.tests.conftest import noop_counter
 from ray.data.tests.util import run_op_tasks_sync
 from ray.tests.conftest import *  # noqa
 
@@ -60,6 +62,9 @@ def test_limit_operator(ray_start_regular_shared):
         refs = make_ref_bundles([[i] * num_rows_per_block for i in range(num_refs)])
         input_op = InputDataBuffer(DataContext.get_current(), refs)
         limit_op = LimitOperator(limit, input_op, DataContext.get_current())
+        counter = noop_counter()
+        input_op.start(ExecutionOptions(), counter)
+        limit_op.start(ExecutionOptions(), counter)
         limit_op.mark_execution_finished = MagicMock(
             wraps=limit_op.mark_execution_finished
         )

From 48e70657a7b362150ce47eaff56b3f22e40198dd Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Fri, 26 Jun 2026 16:23:22 -0700
Subject: [PATCH 47/53] Address start argument changes

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/data/tests/conftest.py              | 2 +-
 python/ray/data/tests/test_resource_manager.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/ray/data/tests/conftest.py b/python/ray/data/tests/conftest.py
index b71e26a17883..b2ec834fbd76 100644
--- a/python/ray/data/tests/conftest.py
+++ b/python/ray/data/tests/conftest.py
@@ -45,7 +45,7 @@ def mock_all_to_all_op(input_op, name="MockAllToAll"):
         data_context=ray.data.DataContext.get_current(),
         name=name,
     )
-    op.start = MagicMock(side_effect=lambda _: None)
+    op.start = MagicMock(side_effect=lambda *_: None)
     return op
 
 
diff --git a/python/ray/data/tests/test_resource_manager.py b/python/ray/data/tests/test_resource_manager.py
index e967514987e0..6f8bcec11e80 100644
--- a/python/ray/data/tests/test_resource_manager.py
+++ b/python/ray/data/tests/test_resource_manager.py
@@ -66,7 +66,7 @@ def mock_union_op(input_ops):
         DataContext.get_current(),
         *input_ops,
     )
-    op.start = MagicMock(side_effect=lambda _: None)
+    op.start = MagicMock(side_effect=lambda *_: None)
     return op
 
 
@@ -90,7 +90,7 @@ def mock_join_op(left_input_op, right_input_op):
             partition_size_hint=1,
         )
 
-    op.start = MagicMock(side_effect=lambda _: None)
+    op.start = MagicMock(side_effect=lambda *_: None)
     return op
 
 

From bd7d6d2944b8bb5eb46e3e60c2d1637dbde3a3e5 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 17 Jun 2026 15:19:30 -0700
Subject: [PATCH 48/53] Adjust resource manager object store memory tracking
 logic

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../_internal/execution/resource_manager.py   |  15 +-
 .../ray/data/tests/test_resource_manager.py   | 250 +++++++-----------
 .../data/tests/unit/test_resource_manager.py  |  10 +
 3 files changed, 116 insertions(+), 159 deletions(-)

diff --git a/python/ray/data/_internal/execution/resource_manager.py b/python/ray/data/_internal/execution/resource_manager.py
index aba473be1992..2df548ff88ec 100644
--- a/python/ray/data/_internal/execution/resource_manager.py
+++ b/python/ray/data/_internal/execution/resource_manager.py
@@ -187,15 +187,16 @@ def _estimate_object_store_memory_usage(
                 return self._external_consumer_bytes
             return 0
 
-        usage = op.estimate_object_store_usage(state)
-        self._mem_op_internal[op] = usage.internal
-        self._mem_op_outputs[op] = usage.outputs
+        # Blocks being generated by running tasks but not yet yielded as ObjectRefs.
+        mem_op_internal = op.metrics.obj_store_mem_pending_task_outputs or 0
 
-        # Attribute iterator / streaming_split prefetch to the executor sink only.
-        if op is self._output_operator:
-            self._mem_op_outputs[op] += self._external_consumer_bytes
+        # All blocks produced by this operator that are still live (in any queue or
+        # held by any running task), as tracked by the block reference counter.
+        mem_op_outputs = self._block_ref_counter.get_object_store_memory_usage(op.id)
 
-        return self._mem_op_outputs[op] + self._mem_op_internal[op]
+        self._mem_op_internal[op] = mem_op_internal
+        self._mem_op_outputs[op] = mem_op_outputs
+        return mem_op_internal + mem_op_outputs
 
     def update_usages(self):
         """Recalculate resource usages."""
diff --git a/python/ray/data/tests/test_resource_manager.py b/python/ray/data/tests/test_resource_manager.py
index 6f8bcec11e80..0085c5985967 100644
--- a/python/ray/data/tests/test_resource_manager.py
+++ b/python/ray/data/tests/test_resource_manager.py
@@ -1,6 +1,5 @@
 import math
 import time
-from dataclasses import replace
 from datetime import timedelta
 from typing import Any, Dict, Optional
 from unittest.mock import MagicMock, patch
@@ -8,7 +7,6 @@
 import pytest
 from freezegun import freeze_time
 
-import ray
 from ray.data._internal.compute import ComputeStrategy
 from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.interfaces import BlockEntry, PhysicalOperator
@@ -16,10 +14,6 @@
     ExecutionOptions,
     ExecutionResources,
 )
-from ray.data._internal.execution.interfaces.physical_operator import (
-    ObjectStoreUsage,
-    TaskExecDriverStats,
-)
 from ray.data._internal.execution.operators.base_physical_operator import (
     AllToAllOperator,
 )
@@ -37,8 +31,6 @@
     OutputBackpressureGuard,
     build_streaming_topology,
 )
-from ray.data._internal.execution.util import make_ref_bundles
-from ray.data.block import TaskExecWorkerStats
 from ray.data.context import DataContext
 from ray.data.tests.conftest import *  # noqa
 from ray.data.tests.conftest import noop_counter
@@ -217,43 +209,36 @@ def test_global_limits_cache(self):
             assert get_total_resources.call_count == 2
 
     def test_update_usage(self):
-        """Test calculating op_usage."""
+        """Test calculating op_usage.
+
+        Object-store memory is now tracked via BlockRefCounter (one counter shared
+        across the whole execution), NOT via queue-size metrics.  Each operator's
+        object-store usage = pending_task_outputs (in-flight generator buffer) +
+        block_ref_counter bytes attributed to that operator.
+        """
         o1 = InputDataBuffer(DataContext.get_current(), [])
         o2 = mock_map_op(o1)
         o3 = mock_map_op(o2)
         topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
 
-        # Mock different metrics that contribute to the resource usage.
         mock_cpu = {
             o1: 0,
             o2: 5,
             o3: 8,
         }
+        # Bytes in the streaming-generator buffer (not yet yielded as ObjectRefs).
         mock_pending_task_outputs = {
             o1: 0,
             o2: 100,
             o3: 200,
         }
-        mock_internal_outqueue = {
+        # Bytes attributed to each operator in the BlockRefCounter (produced ObjectRefs
+        # that are still live somewhere in the pipeline).
+        mock_counter_bytes = {
             o1: 0,
             o2: 300,
             o3: 400,
         }
-        mock_external_outqueue_sizes = {
-            o1: 100,
-            o2: 500,
-            o3: 600,
-        }
-        mock_internal_inqueue = {
-            o1: 0,
-            o2: 700,
-            o3: 800,
-        }
-        mock_pending_task_inputs = {
-            o1: 0,
-            o2: 900,
-            o3: 1000,
-        }
 
         for op in [o1, o2, o3]:
             op.current_logical_usage = MagicMock(
@@ -266,18 +251,7 @@ def test_update_usage(self):
             op.extra_resource_usage = MagicMock(return_value=ExecutionResources.zero())
             op._metrics = MagicMock(
                 obj_store_mem_pending_task_outputs=mock_pending_task_outputs[op],
-                obj_store_mem_internal_outqueue=mock_internal_outqueue[op],
-                obj_store_mem_internal_inqueue=mock_internal_inqueue[op],
-                obj_store_mem_pending_task_inputs=mock_pending_task_inputs[op],
-            )
-            op._metrics.obj_store_mem_internal_inqueue_for_input = MagicMock(
-                return_value=mock_internal_inqueue[op],
-            )
-            ref_bundle = MagicMock(
-                size_bytes=MagicMock(return_value=mock_external_outqueue_sizes[op]),
-                output_split_idx=None,
             )
-            topo[op].add_output(ref_bundle)
 
         resource_manager = ResourceManager(
             topo,
@@ -287,39 +261,34 @@ def test_update_usage(self):
             BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager._op_resource_allocator = None
+
+        # Seed the counter so each eligible op reports its expected counter bytes.
+        for op in [o2, o3]:
+            if mock_counter_bytes[op]:
+                resource_manager.block_ref_counter._bytes_by_producer[
+                    op.id
+                ] = mock_counter_bytes[op]
+
         resource_manager.update_usages()
 
         global_cpu = 0
         global_mem = 0
         for op in [o1, o2, o3]:
             if op == o1:
-                # Resource usage of InputDataBuffer doesn't count.
+                # InputDataBuffer memory is not counted.
                 expected_mem = 0
             else:
-                expected_mem = (
-                    mock_pending_task_outputs[op]
-                    + mock_internal_outqueue[op]
-                    + mock_external_outqueue_sizes[op]
-                )
-                for next_op in op.output_dependencies:
-                    expected_mem += (
-                        +mock_internal_inqueue[next_op]
-                        + mock_pending_task_inputs[next_op]
-                    )
+                expected_mem = mock_pending_task_outputs[op] + mock_counter_bytes[op]
             op_usage = resource_manager.get_op_usage(op)
             assert op_usage.cpu == mock_cpu[op]
             assert op_usage.gpu == 0
             assert op_usage.object_store_memory == expected_mem
             if op != o1:
-                # _mem_op_internal only includes pending_task_outputs
                 assert (
                     resource_manager._mem_op_internal[op]
                     == mock_pending_task_outputs[op]
                 )
-                assert (
-                    resource_manager._mem_op_outputs[op]
-                    == expected_mem - resource_manager._mem_op_internal[op]
-                )
+                assert resource_manager._mem_op_outputs[op] == mock_counter_bytes[op]
             global_cpu += mock_cpu[op]
             global_mem += expected_mem
 
@@ -328,16 +297,15 @@ def test_update_usage(self):
         )
 
     def test_object_store_usage(self, restore_data_context):
-        input = make_ref_bundles([[x] for x in range(1)])[0]
-        # Set block metadata size_bytes to 1 (rather than mocking the method on the
-        # instance, which doesn't survive dataclasses.replace in OpBufferQueue.pop).
-        entry = input.blocks[0]
-        input = replace(
-            input,
-            blocks=[BlockEntry(entry.ref, replace(entry.metadata, size_bytes=1))],
-        )
+        """Object-store usage is tracked via BlockRefCounter, not queue metrics.
 
-        o1 = InputDataBuffer(DataContext.get_current(), [input])
+        Memory is attributed to the operator that called on_block_produced and
+        released automatically via the Ray Core out-of-scope callback.  This test
+        drives the counter directly (bypassing Ray Core) to verify the integration
+        between ResourceManager._estimate_object_store_memory_usage and the counter.
+        """
+
+        o1 = InputDataBuffer(DataContext.get_current(), [])
         o2 = mock_map_op(o1)
         o3 = mock_map_op(o2)
 
@@ -349,124 +317,102 @@ def test_object_store_usage(self, restore_data_context):
             DataContext.get_current(),
             BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
-        ray.data.DataContext.get_current()._max_num_blocks_in_streaming_gen_buffer = 1
-        ray.data.DataContext.get_current().target_max_block_size = 2
 
-        resource_manager.update_usages()
-        assert resource_manager.get_op_usage(o1).object_store_memory == 0
-        assert resource_manager.get_op_usage(o2).object_store_memory == 0
-        assert resource_manager.get_op_usage(o3).object_store_memory == 0
+        counter = resource_manager.block_ref_counter
 
-        # Objects in an operator's internal inqueue typically count toward the previous
-        # operator's object store memory usage. However, data from an
-        # `InputDataBuffer` aren't counted because they were created outside of this
-        # execution.
-        o2.metrics.on_input_queued(input, input_index=0)
         resource_manager.update_usages()
         assert resource_manager.get_op_usage(o1).object_store_memory == 0
         assert resource_manager.get_op_usage(o2).object_store_memory == 0
         assert resource_manager.get_op_usage(o3).object_store_memory == 0
 
-        # During no-sample phase, obj_store_mem_pending_task_outputs uses fallback
-        # estimate based on target_max_block_size.
-        o2.metrics.on_input_dequeued(input, input_index=0)
-        o2.metrics.on_task_submitted(0, input)
-        resource_manager.update_usages()
-        assert resource_manager.get_op_usage(o1).object_store_memory == 0
-        # No sample available yet, returns None
-        assert o2.metrics.obj_store_mem_pending_task_outputs is None
-        op2_usage = resource_manager.get_op_usage(o2).object_store_memory
-        # When pending task outputs is None, it's treated as 0
-        assert op2_usage == 0
-        assert resource_manager.get_op_usage(o3).object_store_memory == 0
+        # InputDataBuffer blocks are not registered — its usage stays 0.
+        assert counter.get_object_store_memory_usage(o1.id) == 0
 
-        # When the task finishes, we move the data from the streaming generator to the
-        # operator's internal outqueue.
-        o2.metrics.on_output_queued(input)
-        o2.metrics.on_task_finished(
-            0,
-            None,
-            TaskExecWorkerStats(task_wall_time_s=0.0),
-            TaskExecDriverStats(task_output_backpressure_s=0),
-        )
-        resource_manager.update_usages()
-        assert resource_manager.get_op_usage(o1).object_store_memory == 0
-        assert resource_manager.get_op_usage(o2).object_store_memory == 1
-        assert resource_manager.get_op_usage(o3).object_store_memory == 0
+        # Simulate o2 producing a 100-byte block.
+        fake_id1 = (1).to_bytes(28, "big")
+        with counter._lock:
+            counter._registered_ids.add(fake_id1)
+            counter._bytes_by_producer[o2.id] += 100
 
-        o2.metrics.on_output_dequeued(input)
-        topo[o2].output_queue.append(input)
         resource_manager.update_usages()
         assert resource_manager.get_op_usage(o1).object_store_memory == 0
-        assert resource_manager.get_op_usage(o2).object_store_memory == 1
+        assert resource_manager.get_op_usage(o2).object_store_memory == 100
         assert resource_manager.get_op_usage(o3).object_store_memory == 0
 
-        # Objects in the current operator's internal inqueue count towards the previous
-        # operator's object store memory usage.
-        # NOTE: `pop()` returns a copy of the bundle (via `dataclasses.replace`), so we
-        # must use the returned reference for subsequent o3 metric calls.
-        o3_input = topo[o2].output_queue.pop()
-        o3.metrics.on_input_queued(o3_input, input_index=0)
+        # Simulate o3 producing a 200-byte block.
+        fake_id2 = (2).to_bytes(28, "big")
+        with counter._lock:
+            counter._registered_ids.add(fake_id2)
+            counter._bytes_by_producer[o3.id] += 200
+
         resource_manager.update_usages()
-        assert resource_manager.get_op_usage(o1).object_store_memory == 0
-        assert resource_manager.get_op_usage(o2).object_store_memory == 1
-        assert resource_manager.get_op_usage(o3).object_store_memory == 0
+        assert resource_manager.get_op_usage(o2).object_store_memory == 100
+        assert resource_manager.get_op_usage(o3).object_store_memory == 200
+
+        # Simulate Ray Core callback firing for o2's block (all refs dropped).
+        with counter._lock:
+            counter._registered_ids.discard(fake_id1)
+            counter._bytes_by_producer[o2.id] -= 100
 
-        # Task inputs count toward the previous operator's object store memory
-        # usage. During no-sample phase, pending task outputs uses fallback estimate.
-        o3.metrics.on_input_dequeued(o3_input, input_index=0)
-        o3.metrics.on_task_submitted(0, o3_input)
         resource_manager.update_usages()
-        assert resource_manager.get_op_usage(o1).object_store_memory == 0
-        assert resource_manager.get_op_usage(o2).object_store_memory == 1
-        # No sample available yet, returns None
-        assert o3.metrics.obj_store_mem_pending_task_outputs is None
-        op3_usage = resource_manager.get_op_usage(o3).object_store_memory
-        # When pending task outputs is None, it's treated as 0
-        assert op3_usage == 0
-
-        # Task inputs no longer count once the task is finished.
-        o3.metrics.on_output_queued(o3_input)
-        o3.metrics.on_task_finished(
-            0,
-            None,
-            TaskExecWorkerStats(task_wall_time_s=0.0),
-            TaskExecDriverStats(task_output_backpressure_s=0),
-        )
+        assert resource_manager.get_op_usage(o2).object_store_memory == 0
+        assert resource_manager.get_op_usage(o3).object_store_memory == 200
+
+        # After clear(), all usage resets to 0.
+        counter.clear()
         resource_manager.update_usages()
-        assert resource_manager.get_op_usage(o1).object_store_memory == 0
         assert resource_manager.get_op_usage(o2).object_store_memory == 0
-        assert resource_manager.get_op_usage(o3).object_store_memory == 1
-
-    def test_object_store_accounting_delegates_to_op(self, restore_data_context):
-        """``ResourceManager`` must dispatch to ``op.estimate_object_store_usage`` so subclasses can override the accounting."""
-        # Real upstream so the override op has a valid input dependency.
-        input = make_ref_bundles([[x] for x in range(1)])[0]
-        upstream = InputDataBuffer(DataContext.get_current(), [input])
-
-        # Subclass that overrides the accounting to return hard-coded
-        # values — bypasses the generic metrics+state computation.
-        override = mock_map_op(upstream)
-        override.estimate_object_store_usage = lambda state: ObjectStoreUsage(
-            internal=42, outputs=100
-        )
+        assert resource_manager.get_op_usage(o3).object_store_memory == 0
+
+    def test_union_no_double_counting(self, restore_data_context):
+        """Blocks passing through UnionOperator are attributed to their original
+        producer, not double-counted.
+
+        UnionOperator is a passthrough — it never calls on_block_produced, so its
+        block_ref_counter usage is always 0.  Blocks in Union's output queue are
+        already captured by the upstream MapOperator's counter entry.  Global usage
+        = sum of upstream producers only, without any inflation from Union.
+        """
 
-        topo = build_streaming_topology(override, ExecutionOptions(), noop_counter())
+        o1 = InputDataBuffer(DataContext.get_current(), [])
+        map_a = mock_map_op(o1, name="MapA")
+        o2 = InputDataBuffer(DataContext.get_current(), [])
+        map_b = mock_map_op(o2, name="MapB")
+        union_op = mock_union_op([map_a, map_b])
+        downstream = mock_map_op(union_op, name="Downstream")
+
+        topo = build_streaming_topology(downstream, ExecutionOptions(), noop_counter())
         resource_manager = ResourceManager(
             topo,
             ExecutionOptions(),
-            MagicMock(return_value=ExecutionResources.zero()),
+            MagicMock(return_value=ExecutionResources(object_store_memory=10_000)),
             DataContext.get_current(),
             BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
+        counter = resource_manager.block_ref_counter
+
+        # Simulate map_a producing a 100-byte block and map_b producing 200 bytes.
+        fake_a = (10).to_bytes(28, "big")
+        fake_b = (20).to_bytes(28, "big")
+        with counter._lock:
+            counter._registered_ids.add(fake_a)
+            counter._bytes_by_producer[map_a.id] += 100
+            counter._registered_ids.add(fake_b)
+            counter._bytes_by_producer[map_b.id] += 200
 
         resource_manager.update_usages()
 
-        # The override's hard-coded values flow through unchanged into
-        # both the per-component dicts and the aggregated op usage.
-        assert resource_manager.get_mem_op_internal(override) == 42
-        assert resource_manager.get_mem_op_outputs(override) == 100
-        assert resource_manager.get_op_usage(override).object_store_memory == 42 + 100
+        # map_a and map_b see their own bytes.
+        assert resource_manager.get_op_usage(map_a).object_store_memory == 100
+        assert resource_manager.get_op_usage(map_b).object_store_memory == 200
+
+        # union_op itself has 0 bytes (it never calls on_block_produced).
+        assert resource_manager.get_op_usage(union_op).object_store_memory == 0
+
+        # Global usage = map_a (100) + map_b (200) only, not inflated by Union.
+        # InputDataBuffer and downstream (no blocks yet) contribute 0.
+        total_obj_store = resource_manager.get_global_usage().object_store_memory
+        assert total_obj_store == 300
 
     def test_get_completed_ops_usage(self, restore_data_context):
         """Test that _get_completed_ops_usage returns total usage of completed ops."""
diff --git a/python/ray/data/tests/unit/test_resource_manager.py b/python/ray/data/tests/unit/test_resource_manager.py
index 463a6eac99a2..8edab188fa5a 100644
--- a/python/ray/data/tests/unit/test_resource_manager.py
+++ b/python/ray/data/tests/unit/test_resource_manager.py
@@ -154,6 +154,13 @@ def test_does_not_double_count_usage_from_union():
     # Add two 1-byte `RefBundle` to the union operator.
     topology[union_op].add_output(bundle1)
     topology[union_op].add_output(bundle2)
+    # With BlockRefCounter, blocks are attributed to their original producer, not to
+    # union_op (which is a pass-through and produces no new ObjectRefs). Simulate
+    # real execution: bundle1 came from input1, bundle2 from input2.
+    # Bypass the Ray core-worker callback since these are fake ObjectRefs.
+    counter = resource_manager.block_ref_counter
+    counter._bytes_by_producer[input1.id] = 1
+    counter._bytes_by_producer[input2.id] = 1
     resource_manager.update_usages()
 
     # The total object store memory usage should be 2. If the resource manager double-
@@ -219,6 +226,9 @@ def test_per_input_inqueue_attribution_for_union():
     # With preserve_order=True, _add_input_inner routes to _input_buffers[input_index].
     union_op.add_input(bundle1, input_index=1)
     union_op.add_input(bundle2, input_index=1)
+    # With BlockRefCounter, blocks in union's input buffer are attributed to the
+    # operator that produced them (input2, not input1 or union_op).
+    resource_manager.block_ref_counter._bytes_by_producer[input2.id] = 20
 
     resource_manager.update_usages()
 

From 03b08ac21862f7e505b1ea3d80379c12fe4032d9 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 22 Jun 2026 18:28:42 -0700
Subject: [PATCH 49/53] Address reviews + remove implementation detail
 descriptions from test docstring

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../ray/data/tests/test_resource_manager.py   | 104 ++++++------------
 1 file changed, 34 insertions(+), 70 deletions(-)

diff --git a/python/ray/data/tests/test_resource_manager.py b/python/ray/data/tests/test_resource_manager.py
index 0085c5985967..454a43860e88 100644
--- a/python/ray/data/tests/test_resource_manager.py
+++ b/python/ray/data/tests/test_resource_manager.py
@@ -9,7 +9,7 @@
 
 from ray.data._internal.compute import ComputeStrategy
 from ray.data._internal.execution.block_ref_counter import BlockRefCounter
-from ray.data._internal.execution.interfaces import BlockEntry, PhysicalOperator
+from ray.data._internal.execution.interfaces import PhysicalOperator
 from ray.data._internal.execution.interfaces.execution_options import (
     ExecutionOptions,
     ExecutionResources,
@@ -36,6 +36,22 @@
 from ray.data.tests.conftest import noop_counter
 
 
+class StubBlockRefCounter:
+    """Test double for BlockRefCounter with directly settable per-operator usage."""
+
+    def __init__(self):
+        self._usage = {}
+
+    def set_usage(self, producer_id: str, bytes: int) -> None:
+        self._usage[producer_id] = bytes
+
+    def get_object_store_memory_usage(self, producer_id: str) -> int:
+        return self._usage.get(producer_id, 0)
+
+    def clear(self) -> None:
+        self._usage.clear()
+
+
 def mock_map_op(
     input_op: PhysicalOperator,
     ray_remote_args: Optional[Dict[str, Any]] = None,
@@ -209,13 +225,7 @@ def test_global_limits_cache(self):
             assert get_total_resources.call_count == 2
 
     def test_update_usage(self):
-        """Test calculating op_usage.
-
-        Object-store memory is now tracked via BlockRefCounter (one counter shared
-        across the whole execution), NOT via queue-size metrics.  Each operator's
-        object-store usage = pending_task_outputs (in-flight generator buffer) +
-        block_ref_counter bytes attributed to that operator.
-        """
+        """Test calculating op_usage."""
         o1 = InputDataBuffer(DataContext.get_current(), [])
         o2 = mock_map_op(o1)
         o3 = mock_map_op(o2)
@@ -226,14 +236,11 @@ def test_update_usage(self):
             o2: 5,
             o3: 8,
         }
-        # Bytes in the streaming-generator buffer (not yet yielded as ObjectRefs).
         mock_pending_task_outputs = {
             o1: 0,
             o2: 100,
             o3: 200,
         }
-        # Bytes attributed to each operator in the BlockRefCounter (produced ObjectRefs
-        # that are still live somewhere in the pipeline).
         mock_counter_bytes = {
             o1: 0,
             o2: 300,
@@ -253,21 +260,19 @@ def test_update_usage(self):
                 obj_store_mem_pending_task_outputs=mock_pending_task_outputs[op],
             )
 
+        counter = StubBlockRefCounter()
         resource_manager = ResourceManager(
             topo,
             ExecutionOptions(),
             MagicMock(),
             DataContext.get_current(),
-            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
+            counter,
         )
         resource_manager._op_resource_allocator = None
 
-        # Seed the counter so each eligible op reports its expected counter bytes.
         for op in [o2, o3]:
             if mock_counter_bytes[op]:
-                resource_manager.block_ref_counter._bytes_by_producer[
-                    op.id
-                ] = mock_counter_bytes[op]
+                counter.set_usage(op.id, mock_counter_bytes[op])
 
         resource_manager.update_usages()
 
@@ -297,63 +302,42 @@ def test_update_usage(self):
         )
 
     def test_object_store_usage(self, restore_data_context):
-        """Object-store usage is tracked via BlockRefCounter, not queue metrics.
-
-        Memory is attributed to the operator that called on_block_produced and
-        released automatically via the Ray Core out-of-scope callback.  This test
-        drives the counter directly (bypassing Ray Core) to verify the integration
-        between ResourceManager._estimate_object_store_memory_usage and the counter.
-        """
+        """ResourceManager reads per-operator memory from BlockRefCounter."""
 
         o1 = InputDataBuffer(DataContext.get_current(), [])
         o2 = mock_map_op(o1)
         o3 = mock_map_op(o2)
 
         topo = build_streaming_topology(o3, ExecutionOptions(), noop_counter())
+        counter = StubBlockRefCounter()
         resource_manager = ResourceManager(
             topo,
             ExecutionOptions(),
             MagicMock(return_value=ExecutionResources.zero()),
             DataContext.get_current(),
-            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
+            counter,
         )
 
-        counter = resource_manager.block_ref_counter
-
         resource_manager.update_usages()
         assert resource_manager.get_op_usage(o1).object_store_memory == 0
         assert resource_manager.get_op_usage(o2).object_store_memory == 0
         assert resource_manager.get_op_usage(o3).object_store_memory == 0
 
-        # InputDataBuffer blocks are not registered — its usage stays 0.
-        assert counter.get_object_store_memory_usage(o1.id) == 0
-
         # Simulate o2 producing a 100-byte block.
-        fake_id1 = (1).to_bytes(28, "big")
-        with counter._lock:
-            counter._registered_ids.add(fake_id1)
-            counter._bytes_by_producer[o2.id] += 100
-
+        counter.set_usage(o2.id, 100)
         resource_manager.update_usages()
         assert resource_manager.get_op_usage(o1).object_store_memory == 0
         assert resource_manager.get_op_usage(o2).object_store_memory == 100
         assert resource_manager.get_op_usage(o3).object_store_memory == 0
 
         # Simulate o3 producing a 200-byte block.
-        fake_id2 = (2).to_bytes(28, "big")
-        with counter._lock:
-            counter._registered_ids.add(fake_id2)
-            counter._bytes_by_producer[o3.id] += 200
-
+        counter.set_usage(o3.id, 200)
         resource_manager.update_usages()
         assert resource_manager.get_op_usage(o2).object_store_memory == 100
         assert resource_manager.get_op_usage(o3).object_store_memory == 200
 
-        # Simulate Ray Core callback firing for o2's block (all refs dropped).
-        with counter._lock:
-            counter._registered_ids.discard(fake_id1)
-            counter._bytes_by_producer[o2.id] -= 100
-
+        # Simulate o2's block being freed.
+        counter.set_usage(o2.id, 0)
         resource_manager.update_usages()
         assert resource_manager.get_op_usage(o2).object_store_memory == 0
         assert resource_manager.get_op_usage(o3).object_store_memory == 200
@@ -365,14 +349,7 @@ def test_object_store_usage(self, restore_data_context):
         assert resource_manager.get_op_usage(o3).object_store_memory == 0
 
     def test_union_no_double_counting(self, restore_data_context):
-        """Blocks passing through UnionOperator are attributed to their original
-        producer, not double-counted.
-
-        UnionOperator is a passthrough — it never calls on_block_produced, so its
-        block_ref_counter usage is always 0.  Blocks in Union's output queue are
-        already captured by the upstream MapOperator's counter entry.  Global usage
-        = sum of upstream producers only, without any inflation from Union.
-        """
+        """UnionOperator passthrough does not inflate global memory usage."""
 
         o1 = InputDataBuffer(DataContext.get_current(), [])
         map_a = mock_map_op(o1, name="MapA")
@@ -382,35 +359,24 @@ def test_union_no_double_counting(self, restore_data_context):
         downstream = mock_map_op(union_op, name="Downstream")
 
         topo = build_streaming_topology(downstream, ExecutionOptions(), noop_counter())
+        counter = StubBlockRefCounter()
         resource_manager = ResourceManager(
             topo,
             ExecutionOptions(),
             MagicMock(return_value=ExecutionResources(object_store_memory=10_000)),
             DataContext.get_current(),
-            BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
+            counter,
         )
-        counter = resource_manager.block_ref_counter
 
-        # Simulate map_a producing a 100-byte block and map_b producing 200 bytes.
-        fake_a = (10).to_bytes(28, "big")
-        fake_b = (20).to_bytes(28, "big")
-        with counter._lock:
-            counter._registered_ids.add(fake_a)
-            counter._bytes_by_producer[map_a.id] += 100
-            counter._registered_ids.add(fake_b)
-            counter._bytes_by_producer[map_b.id] += 200
+        counter.set_usage(map_a.id, 100)
+        counter.set_usage(map_b.id, 200)
 
         resource_manager.update_usages()
 
-        # map_a and map_b see their own bytes.
         assert resource_manager.get_op_usage(map_a).object_store_memory == 100
         assert resource_manager.get_op_usage(map_b).object_store_memory == 200
-
-        # union_op itself has 0 bytes (it never calls on_block_produced).
         assert resource_manager.get_op_usage(union_op).object_store_memory == 0
 
-        # Global usage = map_a (100) + map_b (200) only, not inflated by Union.
-        # InputDataBuffer and downstream (no blocks yet) contribute 0.
         total_obj_store = resource_manager.get_global_usage().object_store_memory
         assert total_obj_store == 300
 
@@ -794,9 +760,7 @@ def test_memory_limit_blocks_task_submission(self, restore_data_context):
             options=options,
             get_total_resources=lambda: cluster_resources,
             data_context=DataContext.get_current(),
-            block_ref_counter=BlockRefCounter(
-                add_object_out_of_scope_callback=lambda *_: True
-            ),
+            block_ref_counter=BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
         )
         resource_manager.update_usages()
 

From 4d0fa2fec9edc0ed69bc0b2fc2935d91f470e886 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 22 Jun 2026 18:38:57 -0700
Subject: [PATCH 50/53] Also address resource manager unit test by switching to
 StubBlockRefCounter

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../data/tests/unit/test_resource_manager.py  | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/python/ray/data/tests/unit/test_resource_manager.py b/python/ray/data/tests/unit/test_resource_manager.py
index 8edab188fa5a..ed7239309d3b 100644
--- a/python/ray/data/tests/unit/test_resource_manager.py
+++ b/python/ray/data/tests/unit/test_resource_manager.py
@@ -1,7 +1,6 @@
 import pytest
 
 import ray
-from ray.data._internal.execution.block_ref_counter import BlockRefCounter
 from ray.data._internal.execution.interfaces import (
     BlockEntry,
     PhysicalOperator,
@@ -22,6 +21,7 @@
 from ray.data.context import DataContext
 from ray.data.tests.conftest import *  # noqa
 from ray.data.tests.conftest import noop_counter
+from ray.data.tests.test_resource_manager import StubBlockRefCounter
 
 
 def test_physical_operator_tracks_output_dependencies():
@@ -130,12 +130,13 @@ def test_does_not_double_count_usage_from_union():
 
     # Create a resource manager.
     total_resources = ExecutionResources(cpu=0, object_store_memory=2)
+    counter = StubBlockRefCounter()
     resource_manager = ResourceManager(
         topology,
         ExecutionOptions(),
         lambda: total_resources,
         DataContext.get_current(),
-        BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
+        counter,
     )
 
     # Create two 1-byte `RefBundle`s.
@@ -154,13 +155,9 @@ def test_does_not_double_count_usage_from_union():
     # Add two 1-byte `RefBundle` to the union operator.
     topology[union_op].add_output(bundle1)
     topology[union_op].add_output(bundle2)
-    # With BlockRefCounter, blocks are attributed to their original producer, not to
-    # union_op (which is a pass-through and produces no new ObjectRefs). Simulate
-    # real execution: bundle1 came from input1, bundle2 from input2.
-    # Bypass the Ray core-worker callback since these are fake ObjectRefs.
-    counter = resource_manager.block_ref_counter
-    counter._bytes_by_producer[input1.id] = 1
-    counter._bytes_by_producer[input2.id] = 1
+    # Blocks are attributed to their original producer, not union_op.
+    counter.set_usage(input1.id, 1)
+    counter.set_usage(input2.id, 1)
     resource_manager.update_usages()
 
     # The total object store memory usage should be 2. If the resource manager double-
@@ -200,12 +197,13 @@ def test_per_input_inqueue_attribution_for_union():
 
     # Create a resource manager.
     total_resources = ExecutionResources(cpu=0, object_store_memory=200)
+    counter = StubBlockRefCounter()
     resource_manager = ResourceManager(
         topology,
         options,
         lambda: total_resources,
         DataContext.get_current(),
-        BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
+        counter,
     )
 
     # Create two 10-byte RefBundles with distinct block refs (simulates real execution
@@ -226,9 +224,8 @@ def test_per_input_inqueue_attribution_for_union():
     # With preserve_order=True, _add_input_inner routes to _input_buffers[input_index].
     union_op.add_input(bundle1, input_index=1)
     union_op.add_input(bundle2, input_index=1)
-    # With BlockRefCounter, blocks in union's input buffer are attributed to the
-    # operator that produced them (input2, not input1 or union_op).
-    resource_manager.block_ref_counter._bytes_by_producer[input2.id] = 20
+    # Blocks in union's input buffer are attributed to their producer (input2).
+    counter.set_usage(input2.id, 20)
 
     resource_manager.update_usages()
 

From eac1e50646ff7dd7c989a77c5fc2a28e1d952025 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 24 Jun 2026 19:47:57 -0700
Subject: [PATCH 51/53] Remove dead code

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 .../execution/interfaces/physical_operator.py | 53 -------------------
 .../shuffle_operators/shuffle_map_operator.py |  4 --
 2 files changed, 57 deletions(-)

diff --git a/python/ray/data/_internal/execution/interfaces/physical_operator.py b/python/ray/data/_internal/execution/interfaces/physical_operator.py
index 17373f07e269..792d905c8b1e 100644
--- a/python/ray/data/_internal/execution/interfaces/physical_operator.py
+++ b/python/ray/data/_internal/execution/interfaces/physical_operator.py
@@ -38,7 +38,6 @@
 
 if TYPE_CHECKING:
 
-    from ray.data._internal.execution.streaming_executor_state import OpState
     from ray.data.block import BlockMetadataWithSchema
 
 logger = logging.getLogger(__name__)
@@ -53,23 +52,6 @@
 Waitable = Union[ray.ObjectRef, ObjectRefGenerator]
 
 
-@dataclass(frozen=True)
-class ObjectStoreUsage:
-    """Per-op object store accounting.
-
-    Attributes:
-        internal: Bytes held by this op's currently-running tasks
-            (outputs not yet yielded to the object store).
-        outputs: Bytes this op has produced that are still live in
-            the object store — its internal output queue, its
-            ``OpState`` external output queue, and the downstream
-            eligible ops' inputs.
-    """
-
-    internal: int
-    outputs: int
-
-
 class OpTask(ABC):
     """Abstract class that represents a task that is created by an PhysicalOperator.
 
@@ -904,41 +886,6 @@ def current_logical_usage(self) -> ExecutionResources:
         """
         return ExecutionResources.zero()
 
-    def estimate_object_store_usage(self, state: "OpState") -> ObjectStoreUsage:
-        """Returns the bytes this operator contributes to the global object
-        store budget. Subclasses may override this when their object store
-        footprint doesn't match the generic model.
-        """
-        # Operator's internal Object Store usage
-        mem_op_internal = self.metrics.obj_store_mem_pending_task_outputs or 0
-
-        # Operator's outputs' Object Store usage
-        op_outputs_bytes = (
-            # Internal output queue
-            self.metrics.obj_store_mem_internal_outqueue
-            +
-            # External output queue
-            state.output_queue_bytes()
-        )
-
-        # TODO fix ineligible ops: this needs to include usage of all of OS
-        #      for ineligible ops
-        #
-        # Outputs of this operator used downstream
-        used_op_outputs_bytes = sum(
-            (
-                downstream_op.metrics.obj_store_mem_internal_inqueue_for_input(
-                    downstream_op.input_dependencies.index(self)
-                )
-                + downstream_op.metrics.obj_store_mem_pending_task_inputs
-            )
-            for downstream_op in self.output_dependencies
-        )
-        return ObjectStoreUsage(
-            internal=int(mem_op_internal),
-            outputs=int(op_outputs_bytes + used_op_outputs_bytes),
-        )
-
     def running_logical_usage(self) -> ExecutionResources:
         """Returns the estimated running CPU, GPU, and memory usage of this operator,
         excluding object store memory.
diff --git a/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_map_operator.py b/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_map_operator.py
index 2f3c7070b115..82daa608d057 100644
--- a/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_map_operator.py
+++ b/python/ray/data/_internal/execution/operators/shuffle_operators/shuffle_map_operator.py
@@ -18,7 +18,6 @@
 )
 from ray.data._internal.execution.interfaces.physical_operator import (
     MetadataOpTask,
-    ObjectStoreUsage,
     OpTask,
     estimate_total_num_of_blocks,
 )
@@ -422,9 +421,6 @@ def current_logical_usage(self) -> ExecutionResources:
             memory=self._map_resource_usage.memory,
         )
 
-    def estimate_object_store_usage(self, state) -> ObjectStoreUsage:
-        return ObjectStoreUsage(internal=0, outputs=0)
-
     def incremental_resource_usage(self) -> ExecutionResources:
         avg_input = self._metrics.average_bytes_inputs_per_task
         memory = int(avg_input * SHUFFLE_PEAK_MEMORY_MULTIPLIER) if avg_input else 0

From b79a2a94ebfc063205afe6b72f7e7d585f3fc464 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Wed, 24 Jun 2026 20:59:16 -0700
Subject: [PATCH 52/53] Fix StubBlockRefCounter argument

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/data/tests/test_resource_manager.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/ray/data/tests/test_resource_manager.py b/python/ray/data/tests/test_resource_manager.py
index 454a43860e88..c43ca608f2e1 100644
--- a/python/ray/data/tests/test_resource_manager.py
+++ b/python/ray/data/tests/test_resource_manager.py
@@ -260,7 +260,7 @@ def test_update_usage(self):
                 obj_store_mem_pending_task_outputs=mock_pending_task_outputs[op],
             )
 
-        counter = StubBlockRefCounter(add_object_out_of_scope_callback=lambda *_: True)
+        counter = StubBlockRefCounter()
         resource_manager = ResourceManager(
             topo,
             ExecutionOptions(),
@@ -760,7 +760,9 @@ def test_memory_limit_blocks_task_submission(self, restore_data_context):
             options=options,
             get_total_resources=lambda: cluster_resources,
             data_context=DataContext.get_current(),
-            block_ref_counter=BlockRefCounter(add_object_out_of_scope_callback=lambda *_: True),
+            block_ref_counter=BlockRefCounter(
+                add_object_out_of_scope_callback=lambda *_: True
+            ),
         )
         resource_manager.update_usages()
 

From addb2185888bdd8b822582ed95ebd45bf7ca2cb1 Mon Sep 17 00:00:00 2001
From: Sirui Huang <ray.huang@anyscale.com>
Date: Mon, 29 Jun 2026 11:49:55 -0700
Subject: [PATCH 53/53] Make StubBlockRefCounter inherit from BlockRefCounter

Signed-off-by: Sirui Huang <ray.huang@anyscale.com>
---
 python/ray/data/tests/test_resource_manager.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/python/ray/data/tests/test_resource_manager.py b/python/ray/data/tests/test_resource_manager.py
index c43ca608f2e1..df4f0e2ca80e 100644
--- a/python/ray/data/tests/test_resource_manager.py
+++ b/python/ray/data/tests/test_resource_manager.py
@@ -36,20 +36,14 @@
 from ray.data.tests.conftest import noop_counter
 
 
-class StubBlockRefCounter:
+class StubBlockRefCounter(BlockRefCounter):
     """Test double for BlockRefCounter with directly settable per-operator usage."""
 
     def __init__(self):
-        self._usage = {}
+        super().__init__(add_object_out_of_scope_callback=lambda *_: True)
 
     def set_usage(self, producer_id: str, bytes: int) -> None:
-        self._usage[producer_id] = bytes
-
-    def get_object_store_memory_usage(self, producer_id: str) -> int:
-        return self._usage.get(producer_id, 0)
-
-    def clear(self) -> None:
-        self._usage.clear()
+        self._bytes_by_producer[producer_id] = bytes
 
 
 def mock_map_op(