From c1d1047f519017d62774b2b8cd8f373254843953 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:10:58 +0700 Subject: [PATCH 1/3] feat: add populateEmbedding option to search and searchByEmbedding Adds an optional populateEmbedding flag (default false) to VectorSearchQuery and VectorSearchEmbeddingQuery, threaded through the search handlers and the DbAdapter.search signature into all three adapters (pg, cf, mongodb) so each result can include its embedding vector, mirroring findByIds. Also fixes the mongodb adapter's search wrapper, which dropped the argument before reaching searchImpl. --- adapters/cf/dev/specs/adapter.spec.ts | 63 ++++++++++++++++++- adapters/cf/src/search.ts | 3 + adapters/mongodb/dev/specs/compliance.spec.ts | 28 +++++++++ adapters/mongodb/src/index.ts | 4 +- adapters/mongodb/src/search.ts | 13 +++- adapters/pg/dev/specs/compliance.spec.ts | 19 ++++++ adapters/pg/src/search.ts | 21 ++++++- dev/helpers/mockAdapter.ts | 2 + dev/specs/searchByEmbedding.spec.ts | 36 +++++++++++ dev/specs/vectorizedPayload.spec.ts | 32 ++++++++++ src/endpoints/vectorSearch.ts | 7 ++- src/index.ts | 2 + src/types.ts | 5 ++ 13 files changed, 226 insertions(+), 9 deletions(-) diff --git a/adapters/cf/dev/specs/adapter.spec.ts b/adapters/cf/dev/specs/adapter.spec.ts index ee988be..ae75b53 100644 --- a/adapters/cf/dev/specs/adapter.spec.ts +++ b/adapters/cf/dev/specs/adapter.spec.ts @@ -14,7 +14,7 @@ function createMockCloudflareBinding() { return { query: vi.fn(async (queryVector: number[], options: any) => { - const { topK = 10, returnMetadata = false, where } = options + const { topK = 10, returnMetadata = false, returnValues = false, where } = options const results = Array.from(storage.values()) .filter((item) => { @@ -36,6 +36,7 @@ function createMockCloudflareBinding() { return { id: item.id, score, + values: returnValues ? item.values : undefined, metadata: returnMetadata ? 
item.metadata : undefined, } }) @@ -439,6 +440,66 @@ describe('createCloudflareVectorizeIntegration', () => { }) }) + describe('search', () => { + test('includes the embedding vector on each result when populateEmbedding is true', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: DIMS } }, + binding: mockBinding as any, + }) + const mockPayload = createMockPayload(mockBinding) + const embedding = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] + + await adapter.storeChunk(mockPayload, 'default', { + sourceCollection: 'posts', + docId: 'doc-1', + chunkIndex: 0, + chunkText: 'find me', + embeddingVersion: 'v1', + embedding, + extensionFields: { category: 'science' }, + }) + + const results = await adapter.search(mockPayload, embedding, 'default', 10, undefined, true) + expect(results).toHaveLength(1) + expect(results[0].embedding).toEqual(embedding) + expect(results[0].chunkText).toBe('find me') + expect((results[0] as any).category).toBe('science') + expect(mockBinding.query).toHaveBeenCalledWith( + embedding, + expect.objectContaining({ returnValues: true }), + ) + }) + + test('omits the embedding vector by default', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: DIMS } }, + binding: mockBinding as any, + }) + const mockPayload = createMockPayload(mockBinding) + const embedding = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] + + await adapter.storeChunk(mockPayload, 'default', { + sourceCollection: 'posts', + docId: 'doc-1', + chunkIndex: 0, + chunkText: 'find me', + embeddingVersion: 'v1', + embedding, + extensionFields: {}, + }) + + const results = await adapter.search(mockPayload, embedding, 'default') + expect(results).toHaveLength(1) + expect(results[0].embedding).toBeUndefined() + expect(mockBinding.query).not.toHaveBeenCalledWith( + embedding, + 
expect.objectContaining({ returnValues: true }), + ) + }) + }) + describe('findByIds', () => { test('returns full EmbeddingRecord including embedding values when populateEmbedding is true', async () => { const mockBinding = createMockCloudflareBinding() diff --git a/adapters/cf/src/search.ts b/adapters/cf/src/search.ts index 4ee5b40..bcc2dcd 100644 --- a/adapters/cf/src/search.ts +++ b/adapters/cf/src/search.ts @@ -8,6 +8,7 @@ export default async ( poolName: KnowledgePoolName, limit: number = 10, where?: Where, + populateEmbedding = false, ): Promise> => { const vectorizeBinding = getVectorizeBinding(payload) @@ -15,6 +16,7 @@ export default async ( const queryOptions: Record = { topK: limit, returnMetadata: 'all' as const, + ...(populateEmbedding ? { returnValues: true } : {}), } let postFilter: Where | null = null @@ -48,6 +50,7 @@ export default async ( chunkIndex: typeof metadata.chunkIndex === 'number' ? metadata.chunkIndex : parseInt(String(metadata.chunkIndex || '0'), 10), chunkText: String(metadata.chunkText || ''), embeddingVersion: String(metadata.embeddingVersion || ''), + ...(populateEmbedding ? { embedding: Array.from(match.values ?? []) } : {}), ...extensionFields, } }) diff --git a/adapters/mongodb/dev/specs/compliance.spec.ts b/adapters/mongodb/dev/specs/compliance.spec.ts index 2e798ea..abf8046 100644 --- a/adapters/mongodb/dev/specs/compliance.spec.ts +++ b/adapters/mongodb/dev/specs/compliance.spec.ts @@ -136,6 +136,34 @@ describe('Mongo Adapter Compliance Tests', () => { const results = await adapter.search(payload, target, 'default', 1) expect(results.length).toBeLessThanOrEqual(1) }) + + // Atlas vector search is eventually consistent: a freshly-seeded doc may not be + // queryable immediately, so poll until the index surfaces it before asserting. 
+ const searchUntilNonEmpty = async (populateEmbedding: boolean) => { + for (let attempt = 0; attempt < 30; attempt++) { + const results = await adapter.search(payload, target, 'default', 10, undefined, populateEmbedding) + if (results.length > 0) return results + await new Promise((resolve) => setTimeout(resolve, 500)) + } + return adapter.search(payload, target, 'default', 10, undefined, populateEmbedding) + } + + test('includes the embedding vector on each result when populateEmbedding is true', async () => { + const results = await searchUntilNonEmpty(true) + expect(results.length).toBeGreaterThan(0) + for (const r of results) { + expect(Array.isArray(r.embedding)).toBe(true) + expect((r.embedding as number[]).length).toBe(DIMS) + } + }) + + test('omits the embedding vector by default', async () => { + const results = await searchUntilNonEmpty(false) + expect(results.length).toBeGreaterThan(0) + for (const r of results) { + expect(r.embedding).toBeUndefined() + } + }) }) describe('deleteChunks()', () => { diff --git a/adapters/mongodb/src/index.ts b/adapters/mongodb/src/index.ts index 5a0d2d2..f8d9260 100644 --- a/adapters/mongodb/src/index.ts +++ b/adapters/mongodb/src/index.ts @@ -88,8 +88,8 @@ export const createMongoVectorIntegration = ( return count > 0 }, - search: (payload, queryEmbedding, poolName, limit, where) => - searchImpl(getCtx(), payload, queryEmbedding, poolName, limit, where), + search: (payload, queryEmbedding, poolName, limit, where, populateEmbedding) => + searchImpl(getCtx(), payload, queryEmbedding, poolName, limit, where, populateEmbedding), findByIds: (payload, poolName, ids, populateEmbedding) => findByIdsImpl(getCtx(), payload, poolName, ids, populateEmbedding), diff --git a/adapters/mongodb/src/search.ts b/adapters/mongodb/src/search.ts index 508c944..ab3f44d 100644 --- a/adapters/mongodb/src/search.ts +++ b/adapters/mongodb/src/search.ts @@ -26,6 +26,7 @@ export async function searchImpl( poolName: string, limit: number = 10, 
where?: Where, + populateEmbedding = false, ): Promise { const pool = ctx.pools[poolName] if (!pool) { @@ -64,7 +65,7 @@ export async function searchImpl( const pipeline: Record[] = [ { $vectorSearch: vectorSearchStage }, { $addFields: { score: { $meta: 'vectorSearchScore' } } }, - { $project: { embedding: 0 } }, + ...(populateEmbedding ? [] : [{ $project: { embedding: 0 } }]), ] const collection = client.db(ctx.dbName).collection(pool.collectionName) @@ -74,10 +75,13 @@ export async function searchImpl( ? rawDocs.filter((d) => evaluatePostFilter(d as Record, postFilter!)) : rawDocs - return filtered.map((d) => mapDocToResult(d as Record)) + return filtered.map((d) => mapDocToResult(d as Record, populateEmbedding)) } -function mapDocToResult(doc: Record): VectorSearchResult { +function mapDocToResult( + doc: Record, + populateEmbedding: boolean, +): VectorSearchResult { if (typeof doc.score !== 'number') { throw new Error( `[@payloadcms-vectorize/mongodb] Search result is missing numeric "score" field; ensure the pipeline adds { score: { $meta: 'vectorSearchScore' } }`, @@ -95,6 +99,9 @@ function mapDocToResult(doc: Record): VectorSearchResult { typeof doc.chunkIndex === 'number' ? doc.chunkIndex : Number(doc.chunkIndex ?? 0), chunkText: String(doc.chunkText ?? ''), embeddingVersion: String(doc.embeddingVersion ?? ''), + ...(populateEmbedding + ? { embedding: Array.isArray(doc.embedding) ? 
(doc.embedding as number[]) : [] } + : {}), ...extensionFields, } as VectorSearchResult } diff --git a/adapters/pg/dev/specs/compliance.spec.ts b/adapters/pg/dev/specs/compliance.spec.ts index 5a28f72..7d270f7 100644 --- a/adapters/pg/dev/specs/compliance.spec.ts +++ b/adapters/pg/dev/specs/compliance.spec.ts @@ -221,6 +221,25 @@ describe('Postgres Adapter Compliance Tests', () => { expect(results.length).toBeLessThanOrEqual(1) }) + + test('includes the embedding vector on each result when populateEmbedding is true', async () => { + const results = await adapter.search(payload, targetEmbedding, 'default', 10, undefined, true) + + expect(results.length).toBeGreaterThan(0) + for (const result of results) { + expect(Array.isArray(result.embedding)).toBe(true) + expect((result.embedding as number[]).length).toBe(DIMS) + } + }) + + test('omits the embedding vector by default', async () => { + const results = await adapter.search(payload, targetEmbedding, 'default', 10) + + expect(results.length).toBeGreaterThan(0) + for (const result of results) { + expect(result.embedding).toBeUndefined() + } + }) }) describe('deleteChunks()', () => { diff --git a/adapters/pg/src/search.ts b/adapters/pg/src/search.ts index 54dcc20..14a08b0 100644 --- a/adapters/pg/src/search.ts +++ b/adapters/pg/src/search.ts @@ -26,6 +26,7 @@ export default async ( poolName: KnowledgePoolName, limit: number = 10, where?: Where, + populateEmbedding = false, ): Promise> => { const isPostgres = payload.db?.pool?.query || payload.db?.drizzle @@ -100,6 +101,9 @@ export default async ( id: table.id, // ensure we select id explicitly score: sql`1 - (${distanceExpr})`, } + if (populateEmbedding) { + selectObj.embedding = table.embedding + } // Add reserved + extension fields from collection config for (const field of collectionConfig.fields ?? 
[]) { @@ -127,7 +131,7 @@ export default async ( // Execute the query const result = await query - return mapRowsToResults(result, collectionConfig) + return mapRowsToResults(result, collectionConfig, populateEmbedding) } /** @@ -284,6 +288,7 @@ function convertWhereToDrizzle(where: Where, table: DrizzleTable, fields: Flatte function mapRowsToResults( rows: Record[], collectionConfig: SanitizedCollectionConfig, + populateEmbedding: boolean, ): Array { // Collect names of fields that are typed as number on the collection const numberFields = new Set() @@ -310,6 +315,7 @@ function mapRowsToResults( typeof rawChunkIndex === 'number' ? rawChunkIndex : parseInt(String(rawChunkIndex), 10), chunkText: String(row.chunkText ?? ''), embeddingVersion: String(row.embeddingVersion ?? ''), + ...(populateEmbedding ? { embedding: parseEmbedding(row.embedding) } : {}), } as VectorSearchResult // Ensure any number fields from the schema are numbers in the result @@ -326,3 +332,16 @@ function mapRowsToResults( return result }) } + +function parseEmbedding(value: unknown): number[] { + if (Array.isArray(value)) return value as number[] + if (typeof value === 'string') { + return value + .replace(/^\[/, '') + .replace(/\]$/, '') + .split(',') + .filter((s) => s.length > 0) + .map((s) => Number(s)) + } + return [] +} diff --git a/dev/helpers/mockAdapter.ts b/dev/helpers/mockAdapter.ts index 0659f1d..82331ff 100644 --- a/dev/helpers/mockAdapter.ts +++ b/dev/helpers/mockAdapter.ts @@ -145,6 +145,7 @@ export const createMockAdapter = (options: MockAdapterOptions = {}): DbAdapter = poolName: string, limit: number = 10, where?: Where, + populateEmbedding = false, ): Promise => { const results: Array = [] @@ -181,6 +182,7 @@ export const createMockAdapter = (options: MockAdapterOptions = {}): DbAdapter = id: stored.id, score, _score: score, // For sorting + ...(populateEmbedding ? 
{ embedding: stored.embedding } : {}), ...docFields, // Includes sourceCollection, docId, chunkText, embeddingVersion, AND extension fields }) } diff --git a/dev/specs/searchByEmbedding.spec.ts b/dev/specs/searchByEmbedding.spec.ts index a3f38ee..f88930a 100644 --- a/dev/specs/searchByEmbedding.spec.ts +++ b/dev/specs/searchByEmbedding.spec.ts @@ -167,6 +167,42 @@ describe('searchByEmbedding method tests', () => { expectResultsOrderedByScore(results) }) + test('searchByEmbedding includes the embedding vector when populateEmbedding is true', async () => { + const queryEmbedding = await embedFn(titleAndQuery) + const embeddingArray = Array.isArray(queryEmbedding) + ? queryEmbedding + : Array.from(queryEmbedding) + + const results = await vectorizedPayload.searchByEmbedding({ + knowledgePool: 'default', + embedding: embeddingArray, + populateEmbedding: true, + }) + + expect(results.length).toBeGreaterThan(0) + for (const r of results) { + expect(Array.isArray(r.embedding)).toBe(true) + expect((r.embedding as number[]).length).toBe(DIMS) + } + }) + + test('searchByEmbedding omits the embedding vector by default', async () => { + const queryEmbedding = await embedFn(titleAndQuery) + const embeddingArray = Array.isArray(queryEmbedding) + ? 
queryEmbedding + : Array.from(queryEmbedding) + + const results = await vectorizedPayload.searchByEmbedding({ + knowledgePool: 'default', + embedding: embeddingArray, + }) + + expect(results.length).toBeGreaterThan(0) + for (const r of results) { + expect(r.embedding).toBeUndefined() + } + }) + test('searchByEmbedding respects limit parameter', async () => { // Get the embedding for our query const queryEmbedding = await embedFn(titleAndQuery) diff --git a/dev/specs/vectorizedPayload.spec.ts b/dev/specs/vectorizedPayload.spec.ts index f4c36b4..28f1644 100644 --- a/dev/specs/vectorizedPayload.spec.ts +++ b/dev/specs/vectorizedPayload.spec.ts @@ -198,6 +198,38 @@ describe('VectorizedPayload', () => { expectResultsContainTitle(results, titleAndQuery, postId, testEmbeddingVersion) }) + + test('includes the embedding vector on each result when populateEmbedding is true', async () => { + const vectorizedPayload = getVectorizedPayload(payload)! + + const results = await vectorizedPayload.search({ + query: titleAndQuery, + knowledgePool: 'default', + limit: 5, + populateEmbedding: true, + }) + + expect(results.length).toBeGreaterThan(0) + for (const r of results) { + expect(Array.isArray(r.embedding)).toBe(true) + expect((r.embedding as number[]).length).toBe(DIMS) + } + }) + + test('omits the embedding vector by default', async () => { + const vectorizedPayload = getVectorizedPayload(payload)! 
+ + const results = await vectorizedPayload.search({ + query: titleAndQuery, + knowledgePool: 'default', + limit: 5, + }) + + expect(results.length).toBeGreaterThan(0) + for (const r of results) { + expect(r.embedding).toBeUndefined() + } + }) }) describe('findByIds method', () => { diff --git a/src/endpoints/vectorSearch.ts b/src/endpoints/vectorSearch.ts index d238a1c..80fc81f 100644 --- a/src/endpoints/vectorSearch.ts +++ b/src/endpoints/vectorSearch.ts @@ -17,6 +17,7 @@ export const createVectorSearchHandlers = ( knowledgePool: KnowledgePoolName, limit?: number, where?: Where, + populateEmbedding?: boolean, ) => { const poolConfig = knowledgePools[knowledgePool] const queryEmbedding = await (async () => { @@ -27,7 +28,7 @@ export const createVectorSearchHandlers = ( const rerank = poolConfig.embeddingConfig.rerank if (!rerank) { - return adapter.search(payload, queryEmbedding, knowledgePool, limit, where) + return adapter.search(payload, queryEmbedding, knowledgePool, limit, where, populateEmbedding) } const effectiveLimit = limit ?? 
10 @@ -39,6 +40,7 @@ export const createVectorSearchHandlers = ( knowledgePool, fetchLimit, where, + populateEmbedding, ) const reranked = await rerank.callback(query, candidates) @@ -51,8 +53,9 @@ export const createVectorSearchHandlers = ( knowledgePool: KnowledgePoolName, limit?: number, where?: Where, + populateEmbedding?: boolean, ) => { - return adapter.search(payload, embedding, knowledgePool, limit, where) + return adapter.search(payload, embedding, knowledgePool, limit, where, populateEmbedding) } const requestHandler: PayloadHandler = async (req) => { diff --git a/src/index.ts b/src/index.ts index f716c0d..01c66de 100644 --- a/src/index.ts +++ b/src/index.ts @@ -358,6 +358,7 @@ export default (pluginOptions: PayloadcmsVectorizeConfig) => params.knowledgePool, params.limit, params.where, + params.populateEmbedding, ), searchByEmbedding: (params: VectorSearchEmbeddingQuery) => vectorSearchHandlers.searchByEmbedding( @@ -366,6 +367,7 @@ export default (pluginOptions: PayloadcmsVectorizeConfig) => params.knowledgePool, params.limit, params.where, + params.populateEmbedding, ), findByIds: (params: { knowledgePool: KnowledgePoolName diff --git a/src/types.ts b/src/types.ts index e414e9b..8778d5b 100644 --- a/src/types.ts +++ b/src/types.ts @@ -348,6 +348,8 @@ export interface VectorSearchQuery { where?: Where /** Optional limit for number of results (default: 10) */ limit?: number + /** When true, include the embedding vector on each result (default: false) */ + populateEmbedding?: boolean } export interface VectorSearchEmbeddingQuery { @@ -359,6 +361,8 @@ export interface VectorSearchEmbeddingQuery { where?: Where /** Optional limit for number of results (default: 10) */ limit?: number + /** When true, include the embedding vector on each result (default: false) */ + populateEmbedding?: boolean } // ========================================== @@ -457,6 +461,7 @@ export type DbAdapter = { poolName: KnowledgePoolName, limit?: number, where?: Where, + 
populateEmbedding?: boolean, ) => Promise> findByIds: ( payload: BasePayload, From 3076698bb086199f4ac5e45bb3b87551b332e212 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 22 Jun 2026 14:05:52 +0700 Subject: [PATCH 2/3] refactor: DRY parseEmbedding and type embedding on VectorSearchResult - Extract the duplicated pg parseEmbedding into a shared module imported by both search.ts and findByIds.ts. - Declare embedding?: number[] explicitly on VectorSearchResult and drop the `as number[]` casts from the populate-embedding tests. - Note in the HTTP search handler that populateEmbedding is intentionally programmatic-only. --- adapters/mongodb/dev/specs/compliance.spec.ts | 2 +- adapters/pg/dev/specs/compliance.spec.ts | 2 +- adapters/pg/src/findByIds.ts | 14 +------------- adapters/pg/src/parseEmbedding.ts | 12 ++++++++++++ adapters/pg/src/search.ts | 14 +------------- dev/specs/searchByEmbedding.spec.ts | 2 +- dev/specs/vectorizedPayload.spec.ts | 2 +- src/endpoints/vectorSearch.ts | 2 ++ src/types.ts | 1 + 9 files changed, 21 insertions(+), 30 deletions(-) create mode 100644 adapters/pg/src/parseEmbedding.ts diff --git a/adapters/mongodb/dev/specs/compliance.spec.ts b/adapters/mongodb/dev/specs/compliance.spec.ts index abf8046..dabc7f7 100644 --- a/adapters/mongodb/dev/specs/compliance.spec.ts +++ b/adapters/mongodb/dev/specs/compliance.spec.ts @@ -153,7 +153,7 @@ describe('Mongo Adapter Compliance Tests', () => { expect(results.length).toBeGreaterThan(0) for (const r of results) { expect(Array.isArray(r.embedding)).toBe(true) - expect((r.embedding as number[]).length).toBe(DIMS) + expect(r.embedding?.length).toBe(DIMS) } }) diff --git a/adapters/pg/dev/specs/compliance.spec.ts b/adapters/pg/dev/specs/compliance.spec.ts index 7d270f7..1278693 100644 --- a/adapters/pg/dev/specs/compliance.spec.ts +++ b/adapters/pg/dev/specs/compliance.spec.ts @@ -228,7 +228,7 @@ describe('Postgres Adapter Compliance Tests', () => { 
expect(results.length).toBeGreaterThan(0) for (const result of results) { expect(Array.isArray(result.embedding)).toBe(true) - expect((result.embedding as number[]).length).toBe(DIMS) + expect(result.embedding?.length).toBe(DIMS) } }) diff --git a/adapters/pg/src/findByIds.ts b/adapters/pg/src/findByIds.ts index 9e51aeb..7799356 100644 --- a/adapters/pg/src/findByIds.ts +++ b/adapters/pg/src/findByIds.ts @@ -3,6 +3,7 @@ import { BasePayload, SanitizedCollectionConfig } from 'payload' import { KnowledgePoolName, EmbeddingRecord } from 'payloadcms-vectorize' import toSnakeCase from 'to-snake-case' import { getEmbeddingsTable } from './drizzle.js' +import { parseEmbedding } from './parseEmbedding.js' export default async ( payload: BasePayload, @@ -119,16 +120,3 @@ function mapRowsToRecords( return record }) } - -function parseEmbedding(value: unknown): number[] { - if (Array.isArray(value)) return value as number[] - if (typeof value === 'string') { - return value - .replace(/^\[/, '') - .replace(/\]$/, '') - .split(',') - .filter((s) => s.length > 0) - .map((s) => Number(s)) - } - return [] -} diff --git a/adapters/pg/src/parseEmbedding.ts b/adapters/pg/src/parseEmbedding.ts new file mode 100644 index 0000000..4c07fbe --- /dev/null +++ b/adapters/pg/src/parseEmbedding.ts @@ -0,0 +1,12 @@ +export function parseEmbedding(value: unknown): number[] { + if (Array.isArray(value)) return value as number[] + if (typeof value === 'string') { + return value + .replace(/^\[/, '') + .replace(/\]$/, '') + .split(',') + .filter((s) => s.length > 0) + .map((s) => Number(s)) + } + return [] +} diff --git a/adapters/pg/src/search.ts b/adapters/pg/src/search.ts index 14a08b0..df8dad9 100644 --- a/adapters/pg/src/search.ts +++ b/adapters/pg/src/search.ts @@ -19,6 +19,7 @@ import { BasePayload, Where, SanitizedCollectionConfig, FlattenedField } from 'p import { KnowledgePoolName, VectorSearchResult } from 'payloadcms-vectorize' import toSnakeCase from 'to-snake-case' import { 
getEmbeddingsTable } from './drizzle.js' +import { parseEmbedding } from './parseEmbedding.js' export default async ( payload: BasePayload, @@ -332,16 +333,3 @@ function mapRowsToResults( return result }) } - -function parseEmbedding(value: unknown): number[] { - if (Array.isArray(value)) return value as number[] - if (typeof value === 'string') { - return value - .replace(/^\[/, '') - .replace(/\]$/, '') - .split(',') - .filter((s) => s.length > 0) - .map((s) => Number(s)) - } - return [] -} diff --git a/dev/specs/searchByEmbedding.spec.ts b/dev/specs/searchByEmbedding.spec.ts index f88930a..1d4a86d 100644 --- a/dev/specs/searchByEmbedding.spec.ts +++ b/dev/specs/searchByEmbedding.spec.ts @@ -182,7 +182,7 @@ describe('searchByEmbedding method tests', () => { expect(results.length).toBeGreaterThan(0) for (const r of results) { expect(Array.isArray(r.embedding)).toBe(true) - expect((r.embedding as number[]).length).toBe(DIMS) + expect(r.embedding?.length).toBe(DIMS) } }) diff --git a/dev/specs/vectorizedPayload.spec.ts b/dev/specs/vectorizedPayload.spec.ts index 28f1644..52f4b4b 100644 --- a/dev/specs/vectorizedPayload.spec.ts +++ b/dev/specs/vectorizedPayload.spec.ts @@ -212,7 +212,7 @@ describe('VectorizedPayload', () => { expect(results.length).toBeGreaterThan(0) for (const r of results) { expect(Array.isArray(r.embedding)).toBe(true) - expect((r.embedding as number[]).length).toBe(DIMS) + expect(r.embedding?.length).toBe(DIMS) } }) diff --git a/src/endpoints/vectorSearch.ts b/src/endpoints/vectorSearch.ts index 80fc81f..2f6edcc 100644 --- a/src/endpoints/vectorSearch.ts +++ b/src/endpoints/vectorSearch.ts @@ -84,6 +84,8 @@ export const createVectorSearchHandlers = ( const payload = req.payload + // populateEmbedding is intentionally not exposed over HTTP — it's a programmatic-only + // option, kept out of the REST response to avoid shipping large vectors over the wire. 
const results = await vectorSearch(payload, query, knowledgePool, limit, where) return Response.json({ results }) diff --git a/src/types.ts b/src/types.ts index 8778d5b..02bf75f 100644 --- a/src/types.ts +++ b/src/types.ts @@ -325,6 +325,7 @@ export interface VectorSearchResult { chunkIndex: number // The index of this chunk chunkText: string // The original text that was vectorized embeddingVersion: string // The version of the embedding model used + embedding?: number[] // Present only when the query requested populateEmbedding [key: string]: any // Extension fields and other dynamic fields } From 7e3a325a99537ad4c4d22322440a9c61ef95ef3a Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 22 Jun 2026 14:34:58 +0700 Subject: [PATCH 3/3] docs: document populateEmbedding on search and searchByEmbedding - Root README: add populateEmbedding to the search/searchByEmbedding Local API params and note it is Local-API-only (the REST endpoint never returns vectors); fix the findByIds note that claimed search never returns the vector. - Adapters README: add populateEmbedding to the DbAdapter.search signature, search-path lifecycle, and method-reference row; add embedding? to the VectorSearchResult type; correct the EmbeddingRecord notes that said search never returns embeddings. --- README.md | 8 +++++--- adapters/README.md | 14 +++++++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index dcd1b64..98330a4 100644 --- a/README.md +++ b/README.md @@ -866,7 +866,9 @@ if (vectorizedPayload) { #### `vectorizedPayload.search(params)` -Perform vector search programmatically without making an HTTP request. Parameters and result shape are identical to [POST `/api/vector-search`](#post-apivector-search). If the pool has a [`rerank`](#reranking-optional) config, this call goes through the same rerank pipeline as the REST endpoint. 
+Perform vector search programmatically without making an HTTP request. The result shape is identical to [POST `/api/vector-search`](#post-apivector-search). If the pool has a [`rerank`](#reranking-optional) config, this call goes through the same rerank pipeline as the REST endpoint. + +**Params:** `{ knowledgePool: string; query: string; where?: Where; limit?: number; populateEmbedding?: boolean }` (`limit` defaults to `10`, `populateEmbedding` to `false`). Set `populateEmbedding: true` to include each result's raw `embedding` vector — handy for feeding straight into [`searchByEmbedding()`](#vectorizedpayloadsearchbyembeddingparams). This option is **Local API only**: the REST endpoint never returns vectors, so it is the one parameter not shared with [POST `/api/vector-search`](#post-apivector-search). **Returns:** `Promise>` — the array that the REST endpoint wraps in `{ results }`. @@ -891,7 +893,7 @@ Unlike [`search()`](#vectorizedpayloadsearchparams), this method does **not** ru There is no REST equivalent; `searchByEmbedding` is Local API only. -**Params:** `{ knowledgePool: string; embedding: number[]; where?: Where; limit?: number }` (`limit` defaults to `10`). +**Params:** `{ knowledgePool: string; embedding: number[]; where?: Where; limit?: number; populateEmbedding?: boolean }` (`limit` defaults to `10`, `populateEmbedding` to `false`). As with [`search()`](#vectorizedpayloadsearchparams), `populateEmbedding: true` includes each result's raw `embedding` vector and is Local API only. **Returns:** `Promise>` — the same array shape as `search()`. @@ -916,7 +918,7 @@ if (seed?.embedding) { #### `vectorizedPayload.findByIds(params)` -Fetch stored embedding records by primary key. The `id` of each record is whatever [`search()`](#vectorizedpayloadsearchparams) returns as `result.id`, so a search result round-trips directly. 
Pass `populateEmbedding: true` to also get the raw embedding vector back (the normal search/query API never returns it) — the building block for "more like this" flows. It defaults to `false`, so by default you get the record's text and metadata without the heavy vector. +Fetch stored embedding records by primary key. The `id` of each record is whatever [`search()`](#vectorizedpayloadsearchparams) returns as `result.id`, so a search result round-trips directly. Pass `populateEmbedding: true` to also get the raw embedding vector back (it is omitted by default) — the building block for "more like this" flows. It defaults to `false`, so by default you get the record's text and metadata without the heavy vector. **Params:** `{ knowledgePool: string; ids: string[]; populateEmbedding?: boolean }` (`populateEmbedding` defaults to `false`). diff --git a/adapters/README.md b/adapters/README.md index 0bef798..1914af0 100644 --- a/adapters/README.md +++ b/adapters/README.md @@ -86,7 +86,7 @@ For each document write in a collection registered to a knowledge pool: 1. A consumer calls either `POST /api/vector-search` or `getVectorizedPayload(payload).search({ knowledgePool, query, where, limit })`. 2. The plugin calls the configured `queryFn(query)` to embed the query string. -3. The plugin calls **`adapter.search(payload, queryEmbedding, poolName, limit, where)`**. +3. The plugin calls **`adapter.search(payload, queryEmbedding, poolName, limit, where, populateEmbedding)`**. 4. The plugin returns the array of `VectorSearchResult` to the caller, untransformed. **Your adapter is responsible for translating Payload-style `where` clauses** into your store's filter language. See [Common pitfalls](#common-pitfalls). 
@@ -151,6 +151,7 @@ export type DbAdapter = { poolName: KnowledgePoolName, limit?: number, where?: Where, + populateEmbedding?: boolean, ) => Promise> findByIds: ( @@ -170,7 +171,7 @@ export type DbAdapter = { | `storeChunk` | Per chunk during real-time ingest **and** per output during bulk completion. | Persist the embedding plus all fields in `StoreChunkData` (including `extensionFields`) so they are queryable from `search`. Idempotency is **not** guaranteed by the plugin — you may receive duplicate calls on retry. | | `deleteChunks` | After a source document is deleted. | Remove every chunk where `sourceCollection === ... && docId === ...`. Must be safe to call when no chunks exist (no-op, no throw). | | `hasEmbeddingVersion` | During bulk-embed planning, per candidate document. | Return `true` iff at least one chunk exists with the matching `(sourceCollection, docId, embeddingVersion)` triple. Must filter on **all three** — older `0.7.0` adapters that ignored `embeddingVersion` caused stale embeddings on model bumps. | -| `search` | Per `/vector-search` request and per `getVectorizedPayload().search()` call. | Translate `where` (Payload-style) into your store's filter language, perform a vector search using `queryEmbedding`, and return up to `limit` results sorted by descending relevance. | +| `search` | Per `/vector-search` request and per `getVectorizedPayload().search()` call. | Translate `where` (Payload-style) into your store's filter language, perform a vector search using `queryEmbedding`, and return up to `limit` results sorted by descending relevance. The raw `embedding` vector is **only included when `populateEmbedding` is `true`** (default `false`) — omit it otherwise so callers that only need text/metadata don't pay for it. Where possible, skip reading the vector at the source (pg: don't select the column; MongoDB: keep a `{ $project: { embedding: 0 } }` stage in the aggregation pipeline); CF returns it only when you pass `returnValues: true`, so request it just for the populated case. 
| | `findByIds` | Per `getVectorizedPayload().findByIds()` call. | Fetch stored embedding records by primary key. **Return an object keyed by the ids you were given:** every requested id must be present as a key, with a found record as the value and `undefined` for any id that didn't resolve. The raw `embedding` vector is **only included when `populateEmbedding` is `true`** (default `false`) — omit it otherwise so callers that only need text/metadata don't pay for it. Where possible, skip reading the vector at the source (pg: don't select the column; MongoDB: `{ projection: { embedding: 0 } }`); CF's `getByIds` always returns values, so omit them post-fetch. Look up by the same `id` your `search` returns as `result.id`. Unknown **and** malformed ids must map to `undefined` — never throw for a bad id. Validate the id shape against your key type before querying so a malformed id can't error the whole batch (MongoDB drops non-24-hex ids; pg drops ids that don't match the PK column type — numeric for integer PKs, uuid-shaped for `uuid` PKs — before the `IN` query; CF's ids are arbitrary strings, so an unknown one is simply absent from `getByIds`). Empty `ids` returns `{}` without a backend call. | ### Error contract @@ -376,6 +377,9 @@ export interface VectorSearchResult { chunkText: string /** Embedding model/version string. */ embeddingVersion: string + /** The raw embedding vector — only present when `search` is called with + * `populateEmbedding: true` (default `false`). */ + embedding?: number[] /** Any extensionFields persisted via storeChunk must round-trip here. */ [key: string]: any } @@ -393,8 +397,8 @@ export interface EmbeddingRecord { chunkText: string /** Embedding model/version string. */ embeddingVersion: string - /** The raw embedding vector — never returned by `search`, and only present - * when `findByIds` is called with `populateEmbedding: true`. 
*/ + /** The raw embedding vector — only present when `findByIds` is called with + * `populateEmbedding: true`. */ embedding?: number[] /** Any extensionFields persisted via storeChunk round-trip here. */ [key: string]: any @@ -409,7 +413,7 @@ export interface EmbeddingRecord { | `chunkText`, `embeddingVersion` | yes | Same. | | `extensionFields.*` | optional | Whatever the user passed in `extensionFields` must be queryable via `where`. | -> `EmbeddingRecord` (returned by `findByIds`) is `VectorSearchResult` without `score` and with an optional raw `embedding?: number[]` — present only when `findByIds` is called with `populateEmbedding: true`. +> `EmbeddingRecord` (returned by `findByIds`) is `VectorSearchResult` without `score`. Both carry an optional raw `embedding?: number[]`, present only when the call requested it via `populateEmbedding: true`. ## Testing your adapter