From 4cd80a488c3853e69a5b5fc941f7d97a39854c36 Mon Sep 17 00:00:00 2001
From: xodn348 <xodn348@tamu.edu>
Date: Tue, 5 May 2026 09:34:56 +0000
Subject: [PATCH] fix(bertscore): cap tokenizer model_max_length to prevent
 Rust backend OverflowError

Models such as microsoft/deberta-xlarge-mnli omit model_max_length from
their tokenizer config.  transformers fills the gap with a huge sentinel
(~1e30), which bert_score then passes to the Rust tokenizers backend via
enable_truncation(), causing OverflowError: int too big to convert.

Add an explicit cap in BERTScore._compute(): if the caller supplies
max_length that value is applied directly; otherwise any sentinel larger
than sys.maxsize is clamped to 512.  Both paths are covered by new unit
tests that mock the scorer so no model download is required.

Fixes #739
---
 metrics/bertscore/bertscore.py | 17 ++++++++
 tests/test_metric_common.py    | 76 ++++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+)

diff --git a/metrics/bertscore/bertscore.py b/metrics/bertscore/bertscore.py
index 071e76ff3..cf2490596 100644
--- a/metrics/bertscore/bertscore.py
+++ b/metrics/bertscore/bertscore.py
@@ -14,6 +14,7 @@
 """ BERTScore metric. """
 
 import functools
+import sys
 from contextlib import contextmanager
 
 import bert_score
@@ -79,6 +80,11 @@ def filter_log(record):
     rescale_with_baseline (bool): Rescale bertscore with pre-computed baseline.
     baseline_path (str): Customized baseline file.
     use_fast_tokenizer (bool): `use_fast` parameter passed to HF tokenizer. New in version 0.3.10.
+    max_length (int): Maximum sequence length for tokenization. Useful when the model does not
+        define ``model_max_length`` in its tokenizer config (e.g. DeBERTa variants), which causes
+        transformers to fall back to a huge sentinel value that overflows the Rust tokenizers backend.
+        When not set, the metric automatically caps any sentinel value larger than ``sys.maxsize``
+        to 512 to prevent the ``OverflowError``.
 
 Returns:
     precision: Precision.
@@ -142,6 +148,7 @@ def _compute(
         rescale_with_baseline=False,
         baseline_path=None,
         use_fast_tokenizer=False,
+        max_length=None,
     ):
 
         if isinstance(references[0], str):
@@ -200,6 +207,16 @@ def _compute(
                     baseline_path=baseline_path,
                 )
 
+            # Some models (e.g. DeBERTa) omit model_max_length from their tokenizer config; transformers
+            # then sets a huge sentinel (~1e30) that overflows the Rust tokenizers backend when passed to
+            # enable_truncation(). Cap it here so bert_score's encode calls stay in a safe integer range.
+            # NOTE(review): the scorer looks cached across calls, so an explicit max_length may persist.
+            _tokenizer = self.cached_bertscorer._tokenizer
+            if max_length is not None:
+                _tokenizer.model_max_length = max_length
+            elif _tokenizer.model_max_length > sys.maxsize:
+                _tokenizer.model_max_length = 512
+
         (P, R, F) = self.cached_bertscorer.score(
             cands=predictions,
             refs=references,
diff --git a/tests/test_metric_common.py b/tests/test_metric_common.py
index 014dc0b32..8252ab0d8 100644
--- a/tests/test_metric_common.py
+++ b/tests/test_metric_common.py
@@ -219,6 +219,82 @@ def predict(self, data, *args, **kwargs):
             yield
 
 
+def test_bertscore_large_model_max_length_does_not_overflow():
+    """Regression test for https://github.com/huggingface/evaluate/issues/739.
+
+    Models that omit model_max_length from their tokenizer config cause transformers to
+    set it to a huge sentinel (~1e30).  That sentinel overflows the Rust tokenizers backend
+    when bert_score passes it to enable_truncation().  BERTScore._compute() should clamp
+    the value to a safe integer before scoring.
+    """
+    import sys
+    import torch
+
+    VERY_LARGE_INTEGER = int(1e30)  # stands in for the oversized sentinel described above
+
+    def bert_cos_score_idf(model, refs, *args, **kwargs):
+        return torch.tensor([[1.0, 1.0, 1.0]] * len(refs))
+
+    class FakeTokenizer:
+        model_max_length = VERY_LARGE_INTEGER
+
+    class FakeScorer:
+        hash = "fakehash"
+        _tokenizer = FakeTokenizer()  # class attribute: every FakeScorer shares this one tokenizer
+
+        def score(self, cands, refs, **kwargs):
+            return (torch.tensor([1.0]), torch.tensor([1.0]), torch.tensor([1.0]))
+
+    with patch("bert_score.scorer.get_model"), patch(
+        "bert_score.scorer.bert_cos_score_idf"
+    ) as mock_score, patch("bert_score.utils.get_hash", return_value="fakehash"), patch(
+        "bert_score.BERTScorer", return_value=FakeScorer()
+    ):
+        mock_score.side_effect = bert_cos_score_idf
+        metric = load(os.path.join("metrics", "bertscore"))
+        result = metric.compute(
+            predictions=["hello there"],
+            references=["hello there"],
+            lang="en",
+        )
+    # compute() must have lowered the shared tokenizer's model_max_length to at most sys.maxsize
+    assert FakeScorer._tokenizer.model_max_length <= sys.maxsize
+    assert result["f1"] == [1.0]
+
+
+def test_bertscore_explicit_max_length_is_honoured():
+    """An explicit max_length is written to the scorer tokenizer's model_max_length."""
+    import torch
+
+    def bert_cos_score_idf(model, refs, *args, **kwargs):
+        return torch.tensor([[1.0, 1.0, 1.0]] * len(refs))
+
+    class FakeTokenizer:
+        model_max_length = 512  # not above sys.maxsize, so the automatic cap would leave it alone
+
+    class FakeScorer:
+        hash = "fakehash"
+        _tokenizer = FakeTokenizer()  # class attribute: the instance the final assertion inspects
+
+        def score(self, cands, refs, **kwargs):
+            return (torch.tensor([1.0]), torch.tensor([1.0]), torch.tensor([1.0]))
+
+    with patch("bert_score.scorer.get_model"), patch(
+        "bert_score.scorer.bert_cos_score_idf"
+    ) as mock_score, patch("bert_score.utils.get_hash", return_value="fakehash"), patch(
+        "bert_score.BERTScorer", return_value=FakeScorer()
+    ):
+        mock_score.side_effect = bert_cos_score_idf
+        metric = load(os.path.join("metrics", "bertscore"))
+        metric.compute(
+            predictions=["hello there"],
+            references=["hello there"],
+            lang="en",
+            max_length=256,  # explicit value under test; differs from the tokenizer's initial 512
+        )
+    assert FakeScorer._tokenizer.model_max_length == 256
+
+
 def test_seqeval_raises_when_incorrect_scheme():
     metric = load(os.path.join("metrics", "seqeval"))
     wrong_scheme = "ERROR"