Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 162 additions & 35 deletions engineV4.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import signal
import subprocess
import sys
import tempfile
import threading
import time
from collections import deque
Expand Down Expand Up @@ -352,32 +353,62 @@ def terminate_child(*args):
result_queue.put(("error", slot_index, api_config_str, str(err)))
continue
result_queue.put(("child", slot_index, child_process.pid))
output_lines = deque(maxlen=40)
try:
for line in child_process.stdout:
output_lines.append(line)
print(line, end="", flush=True)
returncode = child_process.wait()
finally:
if child_process.stdout is not None:
child_process.stdout.close()

child_process = None
if returncode in (0, 2):
merge_sanitizer_case_logs(case_log_dir)
shutil.rmtree(case_log_dir, ignore_errors=True)

if returncode == 0:
result_queue.put(("done", slot_index, api_config_str))
elif returncode == 2:
result_queue.put(
("error", slot_index, api_config_str, f"child exited with {returncode}")
)
else:
output_tail = "".join(output_lines)
result_queue.put(
("crashed", slot_index, api_config_str, returncode, output_tail, "child")
)
output_tail = deque(maxlen=40)
with tempfile.TemporaryFile(
mode="w+t", encoding="utf-8", errors="replace"
) as output_file:
try:
for line in child_process.stdout:
output_tail.append(line)
output_file.write(line)
returncode = child_process.wait()
finally:
if child_process.stdout is not None:
child_process.stdout.close()

child_process = None
output_file.seek(0)
if returncode == options.sanitizer_error_exitcode:
analysis = _analyze_sanitizer_output(
output_file.read(), returncode, options.sanitizer_error_exitcode
)
if analysis.filtered_output:
print(
analysis.filtered_output,
end="" if analysis.filtered_output.endswith("\n") else "\n",
flush=True,
)
else:
analysis = SanitizerOutputAnalysis("", False)
shutil.copyfileobj(output_file, sys.stdout)
sys.stdout.flush()

if returncode in (0, 2) or analysis.ignore_error_exitcode:
merge_sanitizer_case_logs(case_log_dir)
shutil.rmtree(case_log_dir, ignore_errors=True)

if returncode == 0 or analysis.ignore_error_exitcode:
result_queue.put(("done", slot_index, api_config_str))
elif returncode == 2:
result_queue.put(
(
"error",
slot_index,
api_config_str,
f"child exited with {returncode}",
)
)
else:
result_queue.put(
(
"crashed",
slot_index,
api_config_str,
returncode,
"".join(output_tail),
"child",
)
)
finally:
if child_process is not None and child_process.poll() is None:
try:
Expand Down Expand Up @@ -966,10 +997,101 @@ def _prepare_single_config_gpu(options):
return gpu_ids[0]


# Markers used to recognise compute-sanitizer report lines in captured output.
# Sanitizer report lines are detected with ``line.startswith(SANITIZER_PREFIX)``.
SANITIZER_PREFIX = "========="
# Substrings that, on a prefixed line, mark the start of a new report block
# (see _is_sanitizer_block_boundary).
SANITIZER_CUDA_API_ERROR = "CUDA API Error:"
SANITIZER_PROGRAM_HIT = "Program hit"
SANITIZER_ERROR_SUMMARY = "ERROR SUMMARY:"
# Matches the message "cudaVersion argument (<digits>) exceeds the driver
# version (<digits>)"; blocks containing it are treated as ignorable.
CUDA_VERSION_ERROR_RE = re.compile(
    r"cudaVersion argument \(\d+\) exceeds the driver version \(\d+\)"
)


@dataclass(frozen=True)
class SanitizerOutputAnalysis:
    """Immutable result of analysing captured compute-sanitizer output."""

    # Output text after ignored sanitizer lines have been removed.
    filtered_output: str
    # True when the sanitizer error exit code should not be treated as a failure.
    ignore_error_exitcode: bool


def _is_sanitizer_block_boundary(line):
    """Return True when *line* starts a new compute-sanitizer report block.

    A boundary is a line that begins with the sanitizer prefix and also
    contains the CUDA API error marker, the "Program hit" marker, or the
    error-summary marker.
    """
    if not line.startswith(SANITIZER_PREFIX):
        return False
    block_markers = (
        SANITIZER_CUDA_API_ERROR,
        SANITIZER_PROGRAM_HIT,
        SANITIZER_ERROR_SUMMARY,
    )
    return any(marker in line for marker in block_markers)


def _is_cuda_version_error_block(block_lines):
    """Return True when the block is a CUDA API error about the cudaVersion argument.

    The first line must carry the CUDA API error marker, and the version
    mismatch message must appear somewhere in the block's text.
    """
    if not block_lines:
        return False
    if SANITIZER_CUDA_API_ERROR not in block_lines[0]:
        return False
    block_text = "\n".join(block_lines)
    return CUDA_VERSION_ERROR_RE.search(block_text) is not None


def _is_cu_get_proc_address_invalid_value_block(block_lines):
first_line = block_lines[0] if block_lines else ""
return "CUDA_ERROR_INVALID_VALUE" in first_line and "cuGetProcAddress_v2" in first_line


def _analyze_sanitizer_output(output, returncode, sanitizer_error_exitcode):
    """Strip ignorable compute-sanitizer reports from *output* and classify the exit.

    The output is scanned for report blocks. A block starts at a line
    accepted by ``_is_sanitizer_block_boundary`` and runs up to (not
    including) the next such line. For blocks recognised as a cudaVersion
    mismatch or as a cuGetProcAddress_v2 CUDA_ERROR_INVALID_VALUE report,
    every sanitizer-prefixed line in the block is dropped; non-prefixed
    lines inside the block are kept.

    Args:
        output: Captured process output as a single string.
        returncode: Exit code of the process that produced *output*.
        sanitizer_error_exitcode: Exit code that signals sanitizer errors.

    Returns:
        A ``SanitizerOutputAnalysis``. When *returncode* differs from
        *sanitizer_error_exitcode* the output is returned untouched with
        ``ignore_error_exitcode=False``. Otherwise ``filtered_output`` is the
        remaining lines joined with ``"\\n"`` (no trailing newline), and
        ``ignore_error_exitcode`` is True only if at least one cudaVersion
        mismatch block was seen and no other "Program hit" / "CUDA API Error"
        block was kept; in that case error-summary lines are dropped too.
    """
    if returncode != sanitizer_error_exitcode:
        # Always return the analysis type so callers can read
        # .filtered_output / .ignore_error_exitcode unconditionally.
        return SanitizerOutputAnalysis(output, False)

    lines = output.splitlines()
    ignored_line_indices = set()
    saw_cuda_version_error = False
    kept_sanitizer_error = False
    index = 0

    while index < len(lines):
        line = lines[index]
        if not _is_sanitizer_block_boundary(line):
            index += 1
            continue

        # Extend the block up to the next boundary line (or end of output).
        block_end = index + 1
        while block_end < len(lines) and not _is_sanitizer_block_boundary(lines[block_end]):
            block_end += 1

        block_lines = lines[index:block_end]
        is_version_error = _is_cuda_version_error_block(block_lines)
        if is_version_error or _is_cu_get_proc_address_invalid_value_block(block_lines):
            # Drop only the sanitizer's own lines; interleaved program
            # output inside the block is preserved.
            ignored_line_indices.update(
                line_index
                for line_index in range(index, block_end)
                if lines[line_index].startswith(SANITIZER_PREFIX)
            )
            saw_cuda_version_error = saw_cuda_version_error or is_version_error
        elif SANITIZER_PROGRAM_HIT in line or SANITIZER_CUDA_API_ERROR in line:
            # A real report remains, so the error exit code must stand.
            kept_sanitizer_error = True

        index = block_end

    # Seeing a version-mismatch block implies something was ignored, so no
    # separate "ignored anything" flag is needed.
    ignore_error_exitcode = saw_cuda_version_error and not kept_sanitizer_error
    filtered_lines = [
        line
        for line_index, line in enumerate(lines)
        if line_index not in ignored_line_indices
        and not (ignore_error_exitcode and SANITIZER_ERROR_SUMMARY in line)
    ]

    return SanitizerOutputAnalysis("\n".join(filtered_lines), ignore_error_exitcode)


def _is_ignored_cuda_version_sanitizer_error(output, returncode, sanitizer_error_exitcode):
    """Return the ``ignore_error_exitcode`` flag from analysing *output*."""
    analysis = _analyze_sanitizer_output(output, returncode, sanitizer_error_exitcode)
    return analysis.ignore_error_exitcode


def _filter_sanitizer_output(output, returncode, sanitizer_error_exitcode):
    """Return only the filtered text from analysing *output*."""
    analysis = _analyze_sanitizer_output(output, returncode, sanitizer_error_exitcode)
    return analysis.filtered_output


def _validate_sanitizer_command(command):
Expand Down Expand Up @@ -1019,13 +1141,18 @@ def _run_single_config_with_sanitizer(options):
encoding="utf-8",
errors="replace",
)
output = _filter_sanitizer_output(
f"{result.stdout or ''}{result.stderr or ''}",
result.returncode,
options.sanitizer_error_exitcode,
raw_output = f"{result.stdout or ''}{result.stderr or ''}"
analysis = _analyze_sanitizer_output(
raw_output, result.returncode, options.sanitizer_error_exitcode
)
if output:
print(output, end="" if output.endswith("\n") else "\n", flush=True)
if analysis.filtered_output:
print(
analysis.filtered_output,
end="" if analysis.filtered_output.endswith("\n") else "\n",
flush=True,
)
if analysis.ignore_error_exitcode:
return 0
if result.returncode == options.sanitizer_error_exitcode:
print(
f"[error] compute-sanitizer reported errors for {api_config} (exit={result.returncode})",
Expand Down
Loading