Describe the bug
Traceback (most recent call last):
File "/root/.pyenv/versions/3.10.20/lib/python3.10/concurrent/futures/process.py", line 246, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "/workspace/CM20260505/scoring/run_scoring.py", line 311, in _run_scorer_in_parallel
return _run_scorer_parallelizable(
File "/workspace/CM20260505/scoring/run_scoring.py", line 387, in _run_scorer_parallelizable
scoringResults = scorer.prescore(scoringArgs, preserveRatings=not runParallel)
File "/workspace/CM20260505/scoring/scorer.py", line 293, in prescore
ratings, noteStatusHistory = self._filter_input(
File "/workspace/CM20260505/scoring/scorer.py", line 147, in _filter_input
ratings = ratings.merge(
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/frame.py", line 10832, in merge
return merge(
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 184, in merge
return op.get_result(copy=copy)
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 886, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 1151, in _get_join_info
(left_indexer, right_indexer) = self._get_join_indexers()
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 1125, in _get_join_indexers
return get_join_indexers(
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 1759, in get_join_indexers
lidx, ridx = get_join_indexers_non_unique(
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 1793, in get_join_indexers_non_unique
lkey, rkey, count = _factorize_keys(left, right, sort=sort)
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 2487, in _factorize_keys
.combine_chunks()
File "pyarrow/table.pxi", line 780, in pyarrow.lib.ChunkedArray.combine_chunks
File "pyarrow/array.pxi", line 5098, in pyarrow.lib.concat_arrays
File "pyarrow/error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow/error.pxi", line 92, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: offset overflow while concatenating arrays, consider casting input from string to large_string first.
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/workspace/CM20260505/main.py", line 31, in <module>
main()
File "/workspace/CM20260505/scoring/runner.py", line 277, in main
return _run_scorer(args=args, dataLoader=dataLoader, extraScoringArgs=extraScoringArgs)
File "/workspace/CM20260505/scoring/runner.py", line 230, in _run_scorer
scoredNotes, helpfulnessScores, newStatus, auxNoteInfo = run_scoring(
File "/workspace/CM20260505/scoring/run_scoring.py", line 2365, in run_scoring
) = run_prescoring(
File "/workspace/CM20260505/scoring/run_scoring.py", line 1461, in run_prescoring
prescoringModelResultsFromAllScorers = _run_scorers(
File "/workspace/CM20260505/scoring/run_scoring.py", line 547, in _run_scorers
modelResultsAndTimes = [f.result() for f in futures]
File "/workspace/CM20260505/scoring/run_scoring.py", line 547, in <listcomp>
modelResultsAndTimes = [f.result() for f in futures]
File "/root/.pyenv/versions/3.10.20/lib/python3.10/concurrent/futures/_base.py", line 458, in result
return self.__get_result()
File "/root/.pyenv/versions/3.10.20/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
pyarrow.lib.ArrowInvalid: offset overflow while concatenating arrays, consider casting input from string to large_string first.
To Reproduce
The error can be reproduced by running the code at commit 20260505 against the data snapshot dated 2026-05-13.
Describe the bug
Traceback (most recent call last):
File "/root/.pyenv/versions/3.10.20/lib/python3.10/concurrent/futures/process.py", line 246, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "/workspace/CM20260505/scoring/run_scoring.py", line 311, in _run_scorer_in_parallel
return _run_scorer_parallelizable(
File "/workspace/CM20260505/scoring/run_scoring.py", line 387, in _run_scorer_parallelizable
scoringResults = scorer.prescore(scoringArgs, preserveRatings=not runParallel)
File "/workspace/CM20260505/scoring/scorer.py", line 293, in prescore
ratings, noteStatusHistory = self._filter_input(
File "/workspace/CM20260505/scoring/scorer.py", line 147, in _filter_input
ratings = ratings.merge(
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/frame.py", line 10832, in merge
return merge(
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 184, in merge
return op.get_result(copy=copy)
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 886, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 1151, in _get_join_info
(left_indexer, right_indexer) = self._get_join_indexers()
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 1125, in _get_join_indexers
return get_join_indexers(
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 1759, in get_join_indexers
lidx, ridx = get_join_indexers_non_unique(
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 1793, in get_join_indexers_non_unique
lkey, rkey, count = _factorize_keys(left, right, sort=sort)
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 2487, in _factorize_keys
.combine_chunks()
File "pyarrow/table.pxi", line 780, in pyarrow.lib.ChunkedArray.combine_chunks
File "pyarrow/array.pxi", line 5098, in pyarrow.lib.concat_arrays
File "pyarrow/error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow/error.pxi", line 92, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: offset overflow while concatenating arrays, consider casting input from string to large_string first.
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/workspace/CM20260505/main.py", line 31, in <module>
main()
File "/workspace/CM20260505/scoring/runner.py", line 277, in main
return _run_scorer(args=args, dataLoader=dataLoader, extraScoringArgs=extraScoringArgs)
File "/workspace/CM20260505/scoring/runner.py", line 230, in _run_scorer
scoredNotes, helpfulnessScores, newStatus, auxNoteInfo = run_scoring(
File "/workspace/CM20260505/scoring/run_scoring.py", line 2365, in run_scoring
) = run_prescoring(
File "/workspace/CM20260505/scoring/run_scoring.py", line 1461, in run_prescoring
prescoringModelResultsFromAllScorers = _run_scorers(
File "/workspace/CM20260505/scoring/run_scoring.py", line 547, in _run_scorers
modelResultsAndTimes = [f.result() for f in futures]
File "/workspace/CM20260505/scoring/run_scoring.py", line 547, in <listcomp>
modelResultsAndTimes = [f.result() for f in futures]
File "/root/.pyenv/versions/3.10.20/lib/python3.10/concurrent/futures/_base.py", line 458, in result
return self.__get_result()
File "/root/.pyenv/versions/3.10.20/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
pyarrow.lib.ArrowInvalid: offset overflow while concatenating arrays, consider casting input from string to large_string first.
To Reproduce
The error can be reproduced by running the code at commit 20260505 against the data snapshot dated 2026-05-13.