From e53de869f81b2da30c94335948a937f2104821ed Mon Sep 17 00:00:00 2001
From: Claude
Date: Fri, 20 Mar 2026 03:43:58 +0000
Subject: [PATCH] fix: skip binary detection for PDF and DOCX files

PDF and DOCX files are inherently binary (contain null bytes) but have
dedicated parsers. The isBinary() check was rejecting them before the
PDF/DOCX loaders could run, causing indexing to silently skip these
files with only a debug-level log.

Now skips the binary heuristic for known binary formats that have
dedicated loaders (.pdf, .docx).

https://claude.ai/code/session_011Ryet7uu9j6VyzNGmUuaaj
---
 internal/loader/loader.go | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/internal/loader/loader.go b/internal/loader/loader.go
index 229dac1..aacba0a 100644
--- a/internal/loader/loader.go
+++ b/internal/loader/loader.go
@@ -68,18 +68,30 @@ func init() {
 	}
 }
 
+// knownBinaryFormats are file extensions for formats that are inherently binary
+// but have dedicated loaders that know how to parse them.
+var knownBinaryFormats = map[string]bool{
+	".pdf":  true,
+	".docx": true,
+}
+
 // Load dispatches to the correct loader by file extension.
 // Returns ErrBinaryFile (non-fatal) if the file looks like a binary.
 func Load(path string) (*RawDocument, error) {
-	binary, err := isBinary(path)
-	if err != nil {
-		return nil, err
-	}
-	if binary {
-		return nil, fmt.Errorf("%w: %s", ErrBinaryFile, path)
+	ext := strings.ToLower(filepath.Ext(path))
+
+	// Skip binary detection for formats that are inherently binary but have
+	// dedicated parsers (PDF, DOCX).
+	if !knownBinaryFormats[ext] {
+		binary, err := isBinary(path)
+		if err != nil {
+			return nil, err
+		}
+		if binary {
+			return nil, fmt.Errorf("%w: %s", ErrBinaryFile, path)
+		}
 	}
 
-	ext := strings.ToLower(filepath.Ext(path))
 	for _, l := range registry {
 		if l.Supports(ext) {
 			return l.Load(path)