diff --git a/.github/workflows/build-bundles.yml b/.github/workflows/build-bundles.yml index d8df4f7..ba5f48c 100644 --- a/.github/workflows/build-bundles.yml +++ b/.github/workflows/build-bundles.yml @@ -134,7 +134,7 @@ jobs: run: | set -eux if [ "${{ matrix.edition }}" = "heavy" ]; then - /opt/rtrace/python/bin/python3 -c "import angr, capstone; print('OK: heavy deps import')" + /opt/rtrace/python/bin/python3 -c "import angr; print('OK: heavy deps import')" else # light must refuse rich mode with a clear error instead of crashing later if rtrace --logdir /tmp/refused --mode rich -- /bin/true; then diff --git a/DISTRIBUTION.md b/DISTRIBUTION.md index c3e2a78..af8e2ac 100644 --- a/DISTRIBUTION.md +++ b/DISTRIBUTION.md @@ -54,23 +54,23 @@ Python dependencies. | Native tracer (DynamoRIO + `librtrace.so`) | yes | yes | | Boundary detection: linear + nucleus + FunSeeker(.NET) | yes | yes | | Python base: pyelftools, pandas | yes | yes | -| **angr + capstone** (rich-mode prototype analysis) | no | yes | +| **angr** (rich-mode prototype analysis) | no | yes | | `--mode rich` | error → "install rtrace-heavy" | works | -The only weight difference is **angr** (with z3/unicorn/pyvex/claripy); capstone is -a small tag-along that is also only needed in rich mode. +The only weight difference is **angr** (with z3/unicorn/pyvex/claripy), needed only +in rich mode. (The `libcapstone` C library still ships in both editions — nucleus +links it — but the Python `capstone` binding is no longer used.) Mode gating evidence: - `angr`: `_set_function_prototype`, gated by `analyze_function_prototypes=(mode==MODE_RICH)`. -- `capstone`: `Library.decode()`, only called in rich mode. ## Phased plan ### Phase 1 — Refactor + packaging (foundation) — IN PROGRESS 1. `pyproject.toml` with the `rtrace` package + `rtrace` console-script entry point. 2. **Lazy-import** the heavy deps so the light install does not require them: - `angr`, `capstone` (and `nucleus`, which ships in both). -3. 
Dependency extras: base = light; `[heavy]` extra = `angr, capstone`. + `angr` (and `nucleus`, which ships in both). +3. Dependency extras: base = light; `[heavy]` extra = `angr`. 4. `--mode rich` guard in the light edition with a clear "install rtrace-heavy" message. 5. `RTRACE_HOME`-relative path resolution (removes the hardcoded `/home/ubuntu/...` paths and the conda shebangs). diff --git a/docs/packaging.md b/docs/packaging.md index 8b1f4e8..db347ea 100644 --- a/docs/packaging.md +++ b/docs/packaging.md @@ -11,7 +11,7 @@ Two self-contained tarballs, both Linux / x86-64: | Tarball | Edition | Modes | Extra deps | ~Size | |---|---|---|---|---| | `rtrace-light-linux-x64.tar.gz` | light | light | — | 120 MB | -| `rtrace-heavy-linux-x64.tar.gz` | heavy | light + rich | angr, capstone | 250 MB | +| `rtrace-heavy-linux-x64.tar.gz` | heavy | light + rich | angr | 250 MB | Each unpacks to an `rtrace/` directory (the `RTRACE_HOME` bundle): @@ -63,8 +63,8 @@ container — the oldest supported distro, so it also guards the glibc floor — (no build tools, no libicu, no binutils) and proves it is genuinely self-contained: a real `--mode light` trace of `/bin/ls` (exercising drrun, librtrace.so, postprocessing, and FunSeeker boundary detection on the stripped -CET system libraries), a nucleus boundary-detection call, heavy-edition imports -(`angr`/`capstone`), and the light edition's `--mode rich` refusal. +CET system libraries), a nucleus boundary-detection call, the heavy-edition +`angr` import, and the light edition's `--mode rich` refusal. A full rich-mode run is deliberately excluded: it runs angr prototype analysis over every loaded module (libc included), far too slow for CI. 
diff --git a/packaging/build-bundle.sh b/packaging/build-bundle.sh index e4f5cab..1dc51f1 100755 --- a/packaging/build-bundle.sh +++ b/packaging/build-bundle.sh @@ -12,7 +12,7 @@ # # Editions (see DISTRIBUTION.md): # light -- installs `rtrace` (mode 1 only) -# heavy -- installs `rtrace[heavy]` (modes 0 + 1; adds angr/capstone) +# heavy -- installs `rtrace[heavy]` (modes 0 + 1; adds angr) # # Usage: # packaging/build-bundle.sh --prefix --edition light|heavy \ diff --git a/pyproject.toml b/pyproject.toml index 1c9d094..80cfdf0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,6 @@ heavy = [ # angr 9.2.102 assigns CLexer.filename, which pycparser >= 2.23 turned into a # read-only property; without this pin `import angr` crashes. "pycparser==2.22", - "capstone==5.0.0.post1", ] # Development tools (lint, format, tests). dev = [ diff --git a/requirements.txt b/requirements.txt index ff8b776..4c70297 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ angr==9.2.102 pyelftools==0.32 pybind11==2.13.6 -capstone==5.0.0.post1 numpy==2.0.2 pandas==2.3.0 pyghidra==2.2.0 diff --git a/src/rtrace/disassembler.py b/src/rtrace/disassembler.py deleted file mode 100644 index 9654ab4..0000000 --- a/src/rtrace/disassembler.py +++ /dev/null @@ -1,27 +0,0 @@ -"""Capstone-backed disassembly. - -``capstone`` is a heavy-edition (mode 0) dependency, so it is imported lazily: the -disassembler is constructed on first use rather than at import time. This lets the -light edition import this module without ``capstone`` installed. 
-""" - -_DISASSEMBLER = None - - -def _get_disassembler(): - global _DISASSEMBLER - if _DISASSEMBLER is None: - from capstone import CS_ARCH_X86, CS_MODE_64, Cs - - disassembler = Cs(CS_ARCH_X86, CS_MODE_64) - disassembler.detail = True - disassembler.skipdata = True - _DISASSEMBLER = disassembler - return _DISASSEMBLER - - -def disassemble_data(data, base): - insns = [] - for insn in _get_disassembler().disasm(data, base): - insns.append(insn) - return insns diff --git a/src/rtrace/library.py b/src/rtrace/library.py index 2f7b649..126b73b 100644 --- a/src/rtrace/library.py +++ b/src/rtrace/library.py @@ -6,80 +6,16 @@ from elftools.elf.elffile import ELFFile, SymbolTableSection from . import paths - -# capstone is a heavy-edition (mode 0) dependency. It is only referenced by the -# Instruction class methods, which run exclusively in mode 0, so guard the import -# to let the light edition load this module without capstone installed. -try: - from capstone import CS_GRP_CALL, CS_GRP_JUMP, CS_GRP_RET - from capstone.x86_const import ( - X86_INS_ENDBR32, - X86_INS_ENDBR64, - X86_INS_NOP, - X86_OP_MEM, - X86_OP_REG, - ) -except ImportError: - pass - from .boundary_detection import ( boundary_detection_funseeker, boundary_detection_linear, boundary_detection_nucleus, ) -from .disassembler import disassemble_data from .utils import is_func_symbol logger = logging.getLogger(__name__) -class Instruction(object): - def __init__(self, insn, section_name=None, next=None, so_path=None): - self.insn = insn # original instruction object from capstone - self.section_name = section_name - self.address = insn.address - self.next = next - self.prev = None - self.so_path = so_path # path to the shared object file, if applicable - - def __repr__(self): - return ( - f"{self.so_path}:{self.section_name}:{hex(self.address)} " - f"{self.insn.mnemonic} {self.insn.op_str}" - ) - - def is_endbr(self): - return self.insn.id in (X86_INS_ENDBR64, X86_INS_ENDBR32) - - def is_call(self): - return 
CS_GRP_CALL in self.insn.groups - - def is_jmp(self): - return CS_GRP_JUMP in self.insn.groups - - def is_nop(self): - return self.insn.id == X86_INS_NOP - - def is_ret(self): - return CS_GRP_RET in self.insn.groups - - def is_indirect_call(self): - is_call = CS_GRP_CALL in self.insn.groups - if is_call: - op = self.insn.operands[0] - return op.type in (X86_OP_MEM, X86_OP_REG) - else: - return False - - def is_indirect_jmp(self): - is_jmp = CS_GRP_JUMP in self.insn.groups - if is_jmp: - op = self.insn.operands[0] - return op.type in (X86_OP_MEM, X86_OP_REG) - else: - return False - - class Function(object): def __init__(self, start, end, name, so_path): self.start = start @@ -109,8 +45,6 @@ def __init__( # open for the lifetime of the Library; call close() when done. self._file = open(so_path, "rb") self._elffile = ELFFile(self._file) - self._instructions = [] - self._addr_to_instruction = {} self._functions = [] self.boundary_detection_method = boundary_detection_method self.debug_sym_file = debug_sym_file @@ -155,13 +89,6 @@ def close(self): """Close the underlying ELF file handle.""" self._file.close() - def _list_executable_sections(self): - sections = [] - for section in self._elffile.iter_sections(): - if section["sh_flags"] & 0x4: - sections.append(section.name) - return sections - def _has_symtab(self): return self._elffile.get_section_by_name(".symtab") is not None @@ -347,64 +274,6 @@ def _create_functions(self): for f in self._functions: self._addr_to_function[f.start] = f - def decode(self): - executable_sections = self._list_executable_sections() - for section_name in executable_sections: - section_data = self._elffile.get_section_by_name(section_name).data() - section_base_address = self._elffile.get_section_by_name(section_name)["sh_addr"] - instructions = disassemble_data(section_data, section_base_address) - prev_insn = None - for insn in instructions: - instruction = Instruction(insn, section_name, so_path=self.so_path) - instruction.prev = 
prev_insn - self._instructions.append(instruction) - self._addr_to_instruction[insn.address] = instruction - if prev_insn is not None: - prev_insn.next = instruction - prev_insn = instruction - - def dump(self, output_file=None): - if output_file is None: - output_file = os.path.basename(self.so_path) + ".disasm" - with open(output_file, "w") as f: - executable_sections = self._list_executable_sections() - for section_name in executable_sections: - f.write(f"Section: {section_name}\n") - - for insn in self._instructions: - f.write( - f"{insn.address:#x} {insn.insn.mnemonic} " - f"{insn.insn.op_str} {insn.section_name}\n" - ) - - def get_instruction_at_address(self, address): - if address in self._addr_to_instruction: - return self._addr_to_instruction[address] - else: - logger.warning( - "Address not found in cached instructions, disassembling on-the-fly: %#x.", - address, - ) - # find which section the address belongs to - for section_name in self._list_executable_sections(): - section = self._elffile.get_section_by_name(section_name) - section_base_address = section["sh_addr"] - section_size = section["sh_size"] - if section_base_address <= address < section_base_address + section_size: - section_data = section.data() - offset_in_section = address - section_base_address - if offset_in_section < len(section_data): - insn = disassemble_data( - section_data[offset_in_section : offset_in_section + 16], - section_base_address + offset_in_section, - ) - if insn: - decoded_insn = Instruction(insn[0], section_name, so_path=self.so_path) - self._addr_to_instruction[address] = decoded_insn - self._instructions.append(decoded_insn) - return decoded_insn - raise ValueError(f"Cannot find instruction at address {address:#x} in {self.so_path}") - def _get_function_ind_at_address(self, address): # binary search for the function # not the exact address, but within the function range diff --git a/src/rtrace/postprocess.py b/src/rtrace/postprocess.py index 537e115..1a82d10 100755 
--- a/src/rtrace/postprocess.py +++ b/src/rtrace/postprocess.py @@ -113,7 +113,6 @@ def get_func_arg_ret(pid, tid, input_dir): pid, tids, input_dir, - mode=mode, bd_algo=bd_algo, bd_cache_dir=bd_cache_dir, analyze_function_prototypes=(mode == MODE_RICH), diff --git a/src/rtrace/process.py b/src/rtrace/process.py index 7b36916..e1c51e0 100644 --- a/src/rtrace/process.py +++ b/src/rtrace/process.py @@ -14,7 +14,6 @@ def __init__( path, start, end, - mode=0, bd_algo=None, bd_cache_dir=None, analyze_function_prototypes=False, @@ -28,17 +27,10 @@ def __init__( func_info_dir=bd_cache_dir, analyze_function_prototypes=analyze_function_prototypes, ) - if mode == 0: - self.lib.decode() def is_in(self, addr): return self.start <= addr < self.end - def get_instruction_at_address(self, address): - """Get instruction at a specific address within the module.""" - addr_in_module = address - self.start - return self.lib.get_instruction_at_address(addr_in_module) - def get_function_at_address(self, address): """Get function at a specific address within the module.""" addr_in_module = address - self.start @@ -67,7 +59,7 @@ def deduplicate_modules(modules): def get_loaded_module( - pid, tids, input_dir, mode=0, bd_algo=None, bd_cache_dir=None, analyze_function_prototypes=False + pid, tids, input_dir, bd_algo=None, bd_cache_dir=None, analyze_function_prototypes=False ): # first try to read the corresponding pid-tid file, # if it is empty, try to read another pid-tid' file @@ -96,7 +88,6 @@ def read_module_info(file_path): so_path, start, end, - mode=mode, bd_algo=bd_algo, bd_cache_dir=bd_cache_dir, analyze_function_prototypes=analyze_function_prototypes, @@ -133,7 +124,6 @@ def __init__( pid, tids, log_dir, - mode=0, bd_algo=None, bd_cache_dir=None, analyze_function_prototypes=False, @@ -145,7 +135,6 @@ def __init__( pid, tids, log_dir, - mode=mode, bd_algo=bd_algo, bd_cache_dir=bd_cache_dir, analyze_function_prototypes=analyze_function_prototypes,