diff --git a/.github/workflows/build-bundles.yml b/.github/workflows/build-bundles.yml
index d8df4f7..ba5f48c 100644
--- a/.github/workflows/build-bundles.yml
+++ b/.github/workflows/build-bundles.yml
@@ -134,7 +134,7 @@ jobs:
run: |
set -eux
if [ "${{ matrix.edition }}" = "heavy" ]; then
- /opt/rtrace/python/bin/python3 -c "import angr, capstone; print('OK: heavy deps import')"
+ /opt/rtrace/python/bin/python3 -c "import angr; print('OK: heavy deps import')"
else
# light must refuse rich mode with a clear error instead of crashing later
if rtrace --logdir /tmp/refused --mode rich -- /bin/true; then
diff --git a/DISTRIBUTION.md b/DISTRIBUTION.md
index c3e2a78..af8e2ac 100644
--- a/DISTRIBUTION.md
+++ b/DISTRIBUTION.md
@@ -54,23 +54,23 @@ Python dependencies.
| Native tracer (DynamoRIO + `librtrace.so`) | yes | yes |
| Boundary detection: linear + nucleus + FunSeeker(.NET) | yes | yes |
| Python base: pyelftools, pandas | yes | yes |
-| **angr + capstone** (rich-mode prototype analysis) | no | yes |
+| **angr** (rich-mode prototype analysis) | no | yes |
| `--mode rich` | error → "install rtrace-heavy" | works |
-The only weight difference is **angr** (with z3/unicorn/pyvex/claripy); capstone is
-a small tag-along that is also only needed in rich mode.
+The only weight difference is **angr** (with z3/unicorn/pyvex/claripy), needed only
+in rich mode. (The `libcapstone` C library still ships in both editions — nucleus
+links it — but rtrace no longer imports the Python `capstone` binding.)
Mode gating evidence:
- `angr`: `_set_function_prototype`, gated by `analyze_function_prototypes=(mode==MODE_RICH)`.
-- `capstone`: `Library.decode()`, only called in rich mode.
## Phased plan
### Phase 1 — Refactor + packaging (foundation) — IN PROGRESS
1. `pyproject.toml` with the `rtrace` package + `rtrace` console-script entry point.
2. **Lazy-import** the heavy deps so the light install does not require them:
- `angr`, `capstone` (and `nucleus`, which ships in both).
-3. Dependency extras: base = light; `[heavy]` extra = `angr, capstone`.
+ `angr` (and `nucleus`, which ships in both).
+3. Dependency extras: base = light; `[heavy]` extra = `angr`.
4. `--mode rich` guard in the light edition with a clear "install rtrace-heavy" message.
5. `RTRACE_HOME`-relative path resolution (removes the hardcoded `/home/ubuntu/...`
paths and the conda shebangs).
diff --git a/docs/packaging.md b/docs/packaging.md
index 8b1f4e8..db347ea 100644
--- a/docs/packaging.md
+++ b/docs/packaging.md
@@ -11,7 +11,7 @@ Two self-contained tarballs, both Linux / x86-64:
| Tarball | Edition | Modes | Extra deps | ~Size |
|---|---|---|---|---|
| `rtrace-light-linux-x64.tar.gz` | light | light | — | 120 MB |
-| `rtrace-heavy-linux-x64.tar.gz` | heavy | light + rich | angr, capstone | 250 MB |
+| `rtrace-heavy-linux-x64.tar.gz` | heavy | light + rich | angr | 250 MB |
Each unpacks to an `rtrace/` directory (the `RTRACE_HOME` bundle):
@@ -63,8 +63,8 @@ container — the oldest supported distro, so it also guards the glibc floor —
(no build tools, no libicu, no binutils) and proves it is genuinely
self-contained: a real `--mode light` trace of `/bin/ls` (exercising drrun,
librtrace.so, postprocessing, and FunSeeker boundary detection on the stripped
-CET system libraries), a nucleus boundary-detection call, heavy-edition imports
-(`angr`/`capstone`), and the light edition's `--mode rich` refusal.
+CET system libraries), a nucleus boundary-detection call, the heavy-edition
+`angr` import, and the light edition's `--mode rich` refusal.
A full rich-mode run is deliberately excluded: it runs angr prototype analysis
over every loaded module (libc included), far too slow for CI.
diff --git a/packaging/build-bundle.sh b/packaging/build-bundle.sh
index e4f5cab..1dc51f1 100755
--- a/packaging/build-bundle.sh
+++ b/packaging/build-bundle.sh
@@ -12,7 +12,7 @@
#
# Editions (see DISTRIBUTION.md):
# light -- installs `rtrace` (mode 1 only)
-# heavy -- installs `rtrace[heavy]` (modes 0 + 1; adds angr/capstone)
+# heavy -- installs `rtrace[heavy]` (modes 0 + 1; adds angr)
#
# Usage:
# packaging/build-bundle.sh --prefix
--edition light|heavy \
diff --git a/pyproject.toml b/pyproject.toml
index 1c9d094..80cfdf0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,6 @@ heavy = [
# angr 9.2.102 assigns CLexer.filename, which pycparser >= 2.23 turned into a
# read-only property; without this pin `import angr` crashes.
"pycparser==2.22",
- "capstone==5.0.0.post1",
]
# Development tools (lint, format, tests).
dev = [
diff --git a/requirements.txt b/requirements.txt
index ff8b776..4c70297 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
angr==9.2.102
pyelftools==0.32
pybind11==2.13.6
-capstone==5.0.0.post1
numpy==2.0.2
pandas==2.3.0
pyghidra==2.2.0
diff --git a/src/rtrace/disassembler.py b/src/rtrace/disassembler.py
deleted file mode 100644
index 9654ab4..0000000
--- a/src/rtrace/disassembler.py
+++ /dev/null
@@ -1,27 +0,0 @@
-"""Capstone-backed disassembly.
-
-``capstone`` is a heavy-edition (mode 0) dependency, so it is imported lazily: the
-disassembler is constructed on first use rather than at import time. This lets the
-light edition import this module without ``capstone`` installed.
-"""
-
-_DISASSEMBLER = None
-
-
-def _get_disassembler():
- global _DISASSEMBLER
- if _DISASSEMBLER is None:
- from capstone import CS_ARCH_X86, CS_MODE_64, Cs
-
- disassembler = Cs(CS_ARCH_X86, CS_MODE_64)
- disassembler.detail = True
- disassembler.skipdata = True
- _DISASSEMBLER = disassembler
- return _DISASSEMBLER
-
-
-def disassemble_data(data, base):
- insns = []
- for insn in _get_disassembler().disasm(data, base):
- insns.append(insn)
- return insns
diff --git a/src/rtrace/library.py b/src/rtrace/library.py
index 2f7b649..126b73b 100644
--- a/src/rtrace/library.py
+++ b/src/rtrace/library.py
@@ -6,80 +6,16 @@
from elftools.elf.elffile import ELFFile, SymbolTableSection
from . import paths
-
-# capstone is a heavy-edition (mode 0) dependency. It is only referenced by the
-# Instruction class methods, which run exclusively in mode 0, so guard the import
-# to let the light edition load this module without capstone installed.
-try:
- from capstone import CS_GRP_CALL, CS_GRP_JUMP, CS_GRP_RET
- from capstone.x86_const import (
- X86_INS_ENDBR32,
- X86_INS_ENDBR64,
- X86_INS_NOP,
- X86_OP_MEM,
- X86_OP_REG,
- )
-except ImportError:
- pass
-
from .boundary_detection import (
boundary_detection_funseeker,
boundary_detection_linear,
boundary_detection_nucleus,
)
-from .disassembler import disassemble_data
from .utils import is_func_symbol
logger = logging.getLogger(__name__)
-class Instruction(object):
- def __init__(self, insn, section_name=None, next=None, so_path=None):
- self.insn = insn # original instruction object from capstone
- self.section_name = section_name
- self.address = insn.address
- self.next = next
- self.prev = None
- self.so_path = so_path # path to the shared object file, if applicable
-
- def __repr__(self):
- return (
- f"{self.so_path}:{self.section_name}:{hex(self.address)} "
- f"{self.insn.mnemonic} {self.insn.op_str}"
- )
-
- def is_endbr(self):
- return self.insn.id in (X86_INS_ENDBR64, X86_INS_ENDBR32)
-
- def is_call(self):
- return CS_GRP_CALL in self.insn.groups
-
- def is_jmp(self):
- return CS_GRP_JUMP in self.insn.groups
-
- def is_nop(self):
- return self.insn.id == X86_INS_NOP
-
- def is_ret(self):
- return CS_GRP_RET in self.insn.groups
-
- def is_indirect_call(self):
- is_call = CS_GRP_CALL in self.insn.groups
- if is_call:
- op = self.insn.operands[0]
- return op.type in (X86_OP_MEM, X86_OP_REG)
- else:
- return False
-
- def is_indirect_jmp(self):
- is_jmp = CS_GRP_JUMP in self.insn.groups
- if is_jmp:
- op = self.insn.operands[0]
- return op.type in (X86_OP_MEM, X86_OP_REG)
- else:
- return False
-
-
class Function(object):
def __init__(self, start, end, name, so_path):
self.start = start
@@ -109,8 +45,6 @@ def __init__(
# open for the lifetime of the Library; call close() when done.
self._file = open(so_path, "rb")
self._elffile = ELFFile(self._file)
- self._instructions = []
- self._addr_to_instruction = {}
self._functions = []
self.boundary_detection_method = boundary_detection_method
self.debug_sym_file = debug_sym_file
@@ -155,13 +89,6 @@ def close(self):
"""Close the underlying ELF file handle."""
self._file.close()
- def _list_executable_sections(self):
- sections = []
- for section in self._elffile.iter_sections():
- if section["sh_flags"] & 0x4:
- sections.append(section.name)
- return sections
-
def _has_symtab(self):
return self._elffile.get_section_by_name(".symtab") is not None
@@ -347,64 +274,6 @@ def _create_functions(self):
for f in self._functions:
self._addr_to_function[f.start] = f
- def decode(self):
- executable_sections = self._list_executable_sections()
- for section_name in executable_sections:
- section_data = self._elffile.get_section_by_name(section_name).data()
- section_base_address = self._elffile.get_section_by_name(section_name)["sh_addr"]
- instructions = disassemble_data(section_data, section_base_address)
- prev_insn = None
- for insn in instructions:
- instruction = Instruction(insn, section_name, so_path=self.so_path)
- instruction.prev = prev_insn
- self._instructions.append(instruction)
- self._addr_to_instruction[insn.address] = instruction
- if prev_insn is not None:
- prev_insn.next = instruction
- prev_insn = instruction
-
- def dump(self, output_file=None):
- if output_file is None:
- output_file = os.path.basename(self.so_path) + ".disasm"
- with open(output_file, "w") as f:
- executable_sections = self._list_executable_sections()
- for section_name in executable_sections:
- f.write(f"Section: {section_name}\n")
-
- for insn in self._instructions:
- f.write(
- f"{insn.address:#x} {insn.insn.mnemonic} "
- f"{insn.insn.op_str} {insn.section_name}\n"
- )
-
- def get_instruction_at_address(self, address):
- if address in self._addr_to_instruction:
- return self._addr_to_instruction[address]
- else:
- logger.warning(
- "Address not found in cached instructions, disassembling on-the-fly: %#x.",
- address,
- )
- # find which section the address belongs to
- for section_name in self._list_executable_sections():
- section = self._elffile.get_section_by_name(section_name)
- section_base_address = section["sh_addr"]
- section_size = section["sh_size"]
- if section_base_address <= address < section_base_address + section_size:
- section_data = section.data()
- offset_in_section = address - section_base_address
- if offset_in_section < len(section_data):
- insn = disassemble_data(
- section_data[offset_in_section : offset_in_section + 16],
- section_base_address + offset_in_section,
- )
- if insn:
- decoded_insn = Instruction(insn[0], section_name, so_path=self.so_path)
- self._addr_to_instruction[address] = decoded_insn
- self._instructions.append(decoded_insn)
- return decoded_insn
- raise ValueError(f"Cannot find instruction at address {address:#x} in {self.so_path}")
-
def _get_function_ind_at_address(self, address):
# binary search for the function
# not the exact address, but within the function range
diff --git a/src/rtrace/postprocess.py b/src/rtrace/postprocess.py
index 537e115..1a82d10 100755
--- a/src/rtrace/postprocess.py
+++ b/src/rtrace/postprocess.py
@@ -113,7 +113,6 @@ def get_func_arg_ret(pid, tid, input_dir):
pid,
tids,
input_dir,
- mode=mode,
bd_algo=bd_algo,
bd_cache_dir=bd_cache_dir,
analyze_function_prototypes=(mode == MODE_RICH),
diff --git a/src/rtrace/process.py b/src/rtrace/process.py
index 7b36916..e1c51e0 100644
--- a/src/rtrace/process.py
+++ b/src/rtrace/process.py
@@ -14,7 +14,6 @@ def __init__(
path,
start,
end,
- mode=0,
bd_algo=None,
bd_cache_dir=None,
analyze_function_prototypes=False,
@@ -28,17 +27,10 @@ def __init__(
func_info_dir=bd_cache_dir,
analyze_function_prototypes=analyze_function_prototypes,
)
- if mode == 0:
- self.lib.decode()
def is_in(self, addr):
return self.start <= addr < self.end
- def get_instruction_at_address(self, address):
- """Get instruction at a specific address within the module."""
- addr_in_module = address - self.start
- return self.lib.get_instruction_at_address(addr_in_module)
-
def get_function_at_address(self, address):
"""Get function at a specific address within the module."""
addr_in_module = address - self.start
@@ -67,7 +59,7 @@ def deduplicate_modules(modules):
def get_loaded_module(
- pid, tids, input_dir, mode=0, bd_algo=None, bd_cache_dir=None, analyze_function_prototypes=False
+ pid, tids, input_dir, bd_algo=None, bd_cache_dir=None, analyze_function_prototypes=False
):
# first try to read the corresponding pid-tid file,
# if it is empty, try to read another pid-tid' file
@@ -96,7 +88,6 @@ def read_module_info(file_path):
so_path,
start,
end,
- mode=mode,
bd_algo=bd_algo,
bd_cache_dir=bd_cache_dir,
analyze_function_prototypes=analyze_function_prototypes,
@@ -133,7 +124,6 @@ def __init__(
pid,
tids,
log_dir,
- mode=0,
bd_algo=None,
bd_cache_dir=None,
analyze_function_prototypes=False,
@@ -145,7 +135,6 @@ def __init__(
pid,
tids,
log_dir,
- mode=mode,
bd_algo=bd_algo,
bd_cache_dir=bd_cache_dir,
analyze_function_prototypes=analyze_function_prototypes,