diff --git a/go/internal/graph/bulk.go b/go/internal/graph/bulk.go index dd7a6ec5..7de54114 100644 --- a/go/internal/graph/bulk.go +++ b/go/internal/graph/bulk.go @@ -73,7 +73,12 @@ func (s *Store) copyNodeBatch(batch []*model.CodeNode) error { // Cleanup runs whether COPY succeeds or fails. defer os.Remove(tmp.Name()) + // Use pipe '|' as the field delimiter so that JSON property values + // containing commas (e.g. {"language":"python","module":"glob"}) are not + // mis-parsed by Kuzu's CSV reader. NOTE(review): json.Marshal does not + // escape '|', so values containing a literal pipe are not covered — confirm. w := csv.NewWriter(tmp) + w.Comma = '|' for _, n := range batch { row, err := encodeNodeRow(n) if err != nil { @@ -96,8 +101,9 @@ func (s *Store) copyNodeBatch(batch []*model.CodeNode) error { // Kuzu COPY FROM with explicit column list. ToSlash for Windows path // portability — Kuzu's parser accepts forward slashes on all platforms. + // DELIM='|' matches the pipe-separated staging file written above. q := fmt.Sprintf( - "COPY CodeNode(%s) FROM '%s' (header=false)", + "COPY CodeNode(%s) FROM '%s' (header=false, DELIM='|')", strings.Join(nodeColumns, ", "), filepath.ToSlash(tmp.Name()), ) @@ -226,7 +232,9 @@ func (s *Store) copyEdgeBatch(kind model.EdgeKind, batch []*model.CodeEdge) erro } defer os.Remove(tmp.Name()) + // Use pipe '|' as the field delimiter — see copyNodeBatch for the rationale. w := csv.NewWriter(tmp) + w.Comma = '|' for _, e := range batch { props, err := json.Marshal(e.Properties) if err != nil { @@ -255,8 +263,9 @@ func (s *Store) copyEdgeBatch(kind model.EdgeKind, batch []*model.CodeEdge) erro return fmt.Errorf("graph: csv close: %w", err) } + // DELIM='|' matches the pipe-separated staging file written above. 
q := fmt.Sprintf( - "COPY %s FROM '%s' (header=false)", + "COPY %s FROM '%s' (header=false, DELIM='|')", relTableName(kind), filepath.ToSlash(tmp.Name()), ) diff --git a/go/internal/graph/bulk_test.go b/go/internal/graph/bulk_test.go index ea628bb1..2ae4f7cc 100644 --- a/go/internal/graph/bulk_test.go +++ b/go/internal/graph/bulk_test.go @@ -145,6 +145,84 @@ func TestBulkLoadEdgesGroupedByKind(t *testing.T) { } } +// TestBulkLoadEdgesCommaInProperties is a regression test for the bug where +// Properties JSON containing commas (e.g. {"language":"python","module":"glob"}) +// caused Kuzu's CSV parser to count more fields than expected and abort with +// "Copy exception: expected 6 values per row, but got more". The fix switches +// the staging file to pipe-separated (DELIM='|') so those commas are no longer +// read as separators. json.Marshal does not escape '|'; pipes in values are untested. +func TestBulkLoadEdgesCommaInProperties(t *testing.T) { + s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu")) + if err != nil { + t.Fatal(err) + } + defer s.Close() + if err := s.ApplySchema(); err != nil { + t.Fatal(err) + } + nodes := []*model.CodeNode{ + {ID: "py:file:check_structure.py", Kind: model.NodeModule, Label: "check_structure.py"}, + {ID: "py:external:glob", Kind: model.NodeExternal, Label: "glob"}, + } + if err := s.BulkLoadNodes(nodes); err != nil { + t.Fatal(err) + } + edges := []*model.CodeEdge{{ + ID: "py:file:check_structure.py->py:external:glob:imports", + Kind: model.EdgeImports, + SourceID: "py:file:check_structure.py", + TargetID: "py:external:glob", + Confidence: model.ConfidenceLexical, + Source: "GenericImportsDetector", + Properties: map[string]any{ + "language": "python", + "module": "glob", + }, + }} + if err := s.BulkLoadEdges(edges); err != nil { + t.Fatalf("BulkLoadEdges with comma-bearing Properties: %v", err) + } + rows, err := s.Cypher("MATCH ()-[r:IMPORTS]->() RETURN r.id AS id") + if err != nil { + t.Fatal(err) + } + if len(rows) != 1 { + t.Fatalf("want 1 IMPORTS row, 
got %d: %v", len(rows), rows) + } +} + +// TestBulkLoadNodesCommaInProperties is a regression test for nodes whose +// props JSON column contains commas — same root cause as the edge variant. +func TestBulkLoadNodesCommaInProperties(t *testing.T) { + s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu")) + if err != nil { + t.Fatal(err) + } + defer s.Close() + if err := s.ApplySchema(); err != nil { + t.Fatal(err) + } + nodes := []*model.CodeNode{{ + ID: "py:file:app.py", + Kind: model.NodeModule, + Label: "app.py", + Properties: map[string]any{ + "language": "python", + "module": "flask,requests,os", // value itself contains commas + }, + }} + if err := s.BulkLoadNodes(nodes); err != nil { + t.Fatalf("BulkLoadNodes with comma-bearing Properties: %v", err) + } + rows, err := s.Cypher("MATCH (n:CodeNode {id: 'py:file:app.py'}) RETURN n.id AS id") + if err != nil { + t.Fatal(err) + } + if len(rows) != 1 { + t.Fatalf("want 1 node, got %d: %v", len(rows), rows) + } +} + // TestBulkLoadEdgesEmpty — zero edges is a no-op like the node path. func TestBulkLoadEdgesEmpty(t *testing.T) { s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu"))