From e53de869f81b2da30c94335948a937f2104821ed Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 20 Mar 2026 03:43:58 +0000
Subject: [PATCH] fix: skip binary detection for PDF and DOCX files

PDF and DOCX files are inherently binary (contain null bytes) but have
dedicated parsers. The isBinary() check was rejecting them before the
PDF/DOCX loaders could run, causing indexing to silently skip these
files with only a debug-level log.

Load now skips the isBinary() heuristic when the file extension belongs
to a known binary format with a dedicated loader (.pdf, .docx).

https://claude.ai/code/session_011Ryet7uu9j6VyzNGmUuaaj
---
 internal/loader/loader.go | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/internal/loader/loader.go b/internal/loader/loader.go
index 229dac1..aacba0a 100644
--- a/internal/loader/loader.go
+++ b/internal/loader/loader.go
@@ -68,18 +68,30 @@ func init() {
 	}
 }
 
+// knownBinaryFormats is the set of lower-cased file extensions (leading dot
+// included) whose formats are binary by design yet have a dedicated loader.
+var knownBinaryFormats = map[string]bool{
+	".pdf":  true,
+	".docx": true,
+}
+
 // Load dispatches to the correct loader by file extension.
 // Returns ErrBinaryFile (non-fatal) if the file looks like a binary.
 func Load(path string) (*RawDocument, error) {
-	binary, err := isBinary(path)
-	if err != nil {
-		return nil, err
-	}
-	if binary {
-		return nil, fmt.Errorf("%w: %s", ErrBinaryFile, path)
+	ext := strings.ToLower(filepath.Ext(path))
+
+	// Skip binary detection for formats that are inherently binary but have
+	// dedicated parsers (PDF, DOCX).
+	if !knownBinaryFormats[ext] {
+		binary, err := isBinary(path)
+		if err != nil {
+			return nil, err
+		}
+		if binary {
+			return nil, fmt.Errorf("%w: %s", ErrBinaryFile, path)
+		}
 	}
 
-	ext := strings.ToLower(filepath.Ext(path))
 	for _, l := range registry {
 		if l.Supports(ext) {
 			return l.Load(path)