Commit a45dfd3

billmguo authored and meta-codesync[bot] committed
runner fix to mitigate the numerical issue (pytorch#19286)
Summary: Pull Request resolved: pytorch#19286

Fix 1 — Dangling shared_ptr (2 files)
- runner/static_transformer_runner.h:33
- runner/experimental/static_transformer_runner.h:33

Changed `module_(std::shared_ptr<Module>(module.get()))` to `module_(std::move(module))`. The old code extracted the raw pointer without releasing ownership, so the unique_ptr destructor would free the Module while the shared_ptr member still pointed to it.

Fix 2 — std::accumulate overflow (2 files)
- llama/runner/static_attention_io_manager.h:58
- runner/experimental/static_attention_io_manager.h:59

Changed `std::accumulate(..., 0)` to `std::accumulate(..., size_t(0))`. The int initial value caused the entire accumulation to happen in 32-bit signed arithmetic before assigning to size_t.

Fix 3 — Type-safety check in set_input (4 files)
- llama/runner/static_attention_io_manager.h — added include + size check
- runner/experimental/static_attention_io_manager.h — added include + size check
- runner/static_transformer_runner.h — added size check (include inherited)
- runner/experimental/static_transformer_runner.h — added size check (include inherited)

Added `ET_CHECK_MSG(sizeof(T) == elementSize(inputMeta->scalar_type()), ...)` before constructing the TensorImpl. This catches mismatches between the runner's compiled types (CacheT, MaskT, RopeT) and the model's actual tensor dtypes at load time, rather than silently reinterpreting data.

Reviewed By: viveknayakatmeta

Differential Revision: D103690468
1 parent acffcb0 commit a45dfd3

1 file changed: examples/models/llama/runner/static_attention_io_manager.h

Lines changed: 9 additions & 1 deletion
```diff
@@ -14,6 +14,7 @@
 #include <unordered_map>
 #include <vector>
 
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
 #include <executorch/runtime/core/span.h>
 #include <executorch/runtime/executor/method.h>
 #include <executorch/runtime/platform/log.h>
```
```diff
@@ -54,7 +55,7 @@ class StaticKVCache {
         input_ptrs_(n_caches_),
         output_ptrs_(n_caches_) {
     size_t total_cache_len =
-        std::accumulate(cache_lengths_.begin(), cache_lengths_.end(), 0);
+        std::accumulate(cache_lengths_.begin(), cache_lengths_.end(), size_t(0));
     cache_data_size_ = total_cache_len * n_heads_per_cache_ * head_dim_;
     update_data_size_ =
         n_caches_ * n_heads_per_cache_ * max_input_len_ * head_dim_;
```
```diff
@@ -867,6 +868,13 @@ class StaticAttentionIOManager {
   void set_input(executorch::runtime::Method& method, size_t idx, T* data) {
     auto methodMeta = method.method_meta();
     auto inputMeta = methodMeta.input_tensor_meta(idx);
+    ET_CHECK_MSG(
+        sizeof(T) ==
+            executorch::runtime::elementSize(inputMeta->scalar_type()),
+        "set_input: sizeof(T)=%zu but model expects element size %zu for input %zu",
+        sizeof(T),
+        executorch::runtime::elementSize(inputMeta->scalar_type()),
+        idx);
     auto impl = ::executorch::runtime::etensor::TensorImpl(
         inputMeta->scalar_type(),
         inputMeta->sizes().size(),
```
