From 4a268a13c34ea1579862292094b21c0bc7cb33de Mon Sep 17 00:00:00 2001
From: JPark1023
Date: Wed, 4 Mar 2026 10:41:07 -0600
Subject: [PATCH 1/3] Add ResourceEmbedding prisma table

Each resource is split into several chunks, so embeddings are stored one
row per chunk: (resource_id, chunk_index) is unique and Resource holds a
list of embeddings.
---
 backend/prisma/schema.prisma | 51 ++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/backend/prisma/schema.prisma b/backend/prisma/schema.prisma
index b362907..5457af1 100644
--- a/backend/prisma/schema.prisma
+++ b/backend/prisma/schema.prisma
@@ -1,11 +1,13 @@
 generator client {
-  provider = "prisma-client-js"
+  provider        = "prisma-client-js"
+  previewFeatures = ["postgresqlExtensions"]
 }
 
 datasource db {
-  provider  = "postgresql"
-  url       = env("DATABASE_URL")
-  directUrl = env("DIRECT_URL")
+  provider   = "postgresql"
+  url        = env("DATABASE_URL")
+  directUrl  = env("DIRECT_URL")
+  extensions = [pgvector(map: "vector", schema: "extensions")]
 }
 
 enum RESOURCE_TYPE {
@@ -75,7 +77,8 @@ model Resource {
   externalResources ExternalResources?
   collectionItems   CollectionItem[]
   internalResource  InternalHostedResources?
-  image_s3_key String?
+  image_s3_key      String?
+  ResourceEmbedding ResourceEmbedding[]
 
   @@map("resource")
 }
@@ -84,7 +87,7 @@ model ResourceLabel {
   id          String        @id @default(cuid())
   resource_id String
   label_id    String
-  resource    Resource      @relation(fields: [resource_id], references: [id], onDelete:Cascade)
+  resource    Resource      @relation(fields: [resource_id], references: [id], onDelete: Cascade)
   label       CategoryLabel @relation(fields: [label_id], references: [id])
 
   @@unique([resource_id, label_id])
@@ -100,6 +103,21 @@ model ExternalResources {
   @@map("externalResources")
 }
 
+model ResourceEmbedding {
+  id          String                      @id @default(cuid())
+  resource_id String
+  embedding   Unsupported("vector(1536)")
+  dims        Int                         @default(1536)
+  chunk_text  String
+  chunk_index Int
+  created_at  DateTime                    @default(now())
+  updated_at  DateTime                    @updatedAt
+  resource    Resource                    @relation(fields: [resource_id], references: [id], onDelete: Cascade)
+
+  @@unique([resource_id, chunk_index])
+  @@map("resourceEmbedding")
+}
+
 enum RELATIONSHIP_TYPE {
   MOTHER
   FATHER
@@ -119,11 +137,11 @@ enum HOUSEHOLD_TYPE {
 }
 
 model Parent {
-  id String @id @default(cuid())
-  clerk_id String @unique
+  id                    String             @id @default(cuid())
+  clerk_id              String             @unique
   first_name            String?
   last_name             String?
-  email String @unique
+  email                 String             @unique
   relationship          RELATIONSHIP_TYPE?
   household_type        HOUSEHOLD_TYPE?
 
@@ -131,12 +149,12 @@ model Parent {
   kids_age_groups       AGE_GROUP[]
   subscribed_newsletter Boolean            @default(false)
 
-  created_at DateTime @default(now())
-  updated_at DateTime @updatedAt
-  onboarding_complete Boolean @default(false)
+  created_at            DateTime           @default(now())
+  updated_at            DateTime           @updatedAt
+  onboarding_complete   Boolean            @default(false)
+  collections           Collection[]
 
 
-  collections Collection[]
   @@map("parent")
 }
 
@@ -147,8 +165,8 @@ model Collection {
   created_at DateTime         @default(now())
   updated_at DateTime         @updatedAt
 
-  parent Parent @relation(fields: [parent_fk], references: [id], onDelete: Cascade)
-  items CollectionItem[]
+  parent     Parent           @relation(fields: [parent_fk], references: [id], onDelete: Cascade)
+  items      CollectionItem[]
 
   @@unique([parent_fk, name])
   @@map("collection")
@@ -176,7 +194,6 @@ model InternalHostedResources {
   @@map("internalHostedResources")
 }
 
-
 model AdminUser {
   id       String @id @default(cuid())
   clerk_id String @unique
@@ -203,4 +220,4 @@ model AdminLog {
 
   @@index([admin_fk])
   @@map("adminlog")
-}
\ No newline at end of file
+}

From 3cad9a58213c6b1f8b2f4a30c164c3f56e3fdfae Mon Sep 17 00:00:00 2001
From: JPark1023
Date: Wed, 4 Mar 2026 13:27:55 -0600
Subject: [PATCH 2/3] Add resourceChunker service

Performs formatting to support embedding
---
 backend/services/rag/resourceChunker.ts | 174 ++++++++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 backend/services/rag/resourceChunker.ts

diff --git a/backend/services/rag/resourceChunker.ts b/backend/services/rag/resourceChunker.ts
new file mode 100644
index 0000000..044e629
--- /dev/null
+++ b/backend/services/rag/resourceChunker.ts
@@ -0,0 +1,174 @@
+type CategoryType =
+  | 'PARENTING_SKILLS_RELATIONSHIPS'
+  | 'CHILD_DEVELOPMENT'
+  | 'MENTAL_EMOTIONAL_HEALTH'
+  | 'SAFETY_PROTECTION'
+  | 'EDUCATION_LEARNING'
+  | 'HEALTH_WELLBEING'
+  | 'LIFE_SKILLS_INDEPENDENCE'
+  | 'FAMILY_SUPPORT_COMMUNITY';
+
+type AgeGroup =
+  | 'AGE_0_3'
+  | 'AGE_4_6'
+  | 'AGE_7_10'
+  | 'AGE_11_13'
+  | 'AGE_14_18'
+  | 'AGE_18_ABOVE';
+
+type Language = 'ENGLISH' | 'SPANISH' | 'OTHER';
+
+type ResourceType =
+  | 'PDF'
+  | 'TXT'
+  | 'VIDEO'
+  | 'WEBINAR'
+  | 'WEBPAGE'
+  | 'INTERACTIVE_QUIZ'
+  | 'OTHER';
+
+type CategoryLabel = {
+  id: string;
+  label_name: string;
+  category: CategoryType;
+};
+
+type ResourceLabel = {
+  id: string;
+  resource_id: string;
+  label_id: string;
+  label: CategoryLabel;
+};
+
+type ExternalResources = {
+  external_url: string;
+};
+
+export type Resource = {
+  id: string;
+  title: string;
+  description?: string | null;
+  category: CategoryType;
+  ageGroups: AgeGroup[];
+  language: Language;
+  resourceType: ResourceType;
+  timeToRead: number;
+  labels: ResourceLabel[];
+  image_s3_key?: string | null;
+  externalResources?: ExternalResources | null;
+};
+
+export type Chunk = {
+  index: number;
+  content: string;
+};
+
+export type ChunkedResource = {
+  resourceId: string;
+  chunkIndex: number;
+  content: string;
+};
+
+const AGE_GROUP_ORDER: AgeGroup[] = [
+  'AGE_0_3',
+  'AGE_4_6',
+  'AGE_7_10',
+  'AGE_11_13',
+  'AGE_14_18',
+  'AGE_18_ABOVE',
+];
+
+// Returns a comparator that sorts by position in `order`; unknown values go last, alphabetically.
+function orderComparator<T extends string>(order: T[]) {
+  const index = new Map<T, number>();
+  order.forEach((v, i) => index.set(v, i));
+  return (a: T, b: T) => {
+    const ia = index.get(a) ?? Number.POSITIVE_INFINITY;
+    const ib = index.get(b) ?? Number.POSITIVE_INFINITY;
+    if (ia !== ib) return ia - ib;
+    return a.localeCompare(b);
+  };
+}
+
+const compareAgeGroups = orderComparator(AGE_GROUP_ORDER);
+
+/**
+ * Builds resource embedding text
+ *
+ * @param resource a Resource object as described in prisma schema
+ * @returns string -> The embedding text
+ */
+export function buildResourceEmbeddingText(resource: Resource): string {
+  const title = resource.title.trim();
+  const description = resource.description ? resource.description.trim() : '';
+  const category = resource.category;
+  const resourceType = resource.resourceType;
+  const language = resource.language;
+  const timeToRead = String(resource.timeToRead);
+  const ageGroups = Array.isArray(resource.ageGroups) ? [...resource.ageGroups].sort(compareAgeGroups) : [];
+  const labels =
+    Array.isArray(resource.labels) && resource.labels.length
+      ? resource.labels.map((rl) => (rl.label?.label_name || '').trim()).filter(Boolean).sort()
+      : [];
+
+  const externalUrl = resource.externalResources?.external_url || '';
+
+  const parts: string[] = [];
+  parts.push('=== Resource ===');
+  parts.push(`Title: ${title}`);
+  if (description) parts.push(`Description: ${description}`);
+  parts.push(`Category: ${category}`);
+  parts.push(`Age Groups: ${ageGroups.join(', ')}`);
+  parts.push(`Language: ${language}`);
+  parts.push(`Resource Type: ${resourceType}`);
+  parts.push(`Time to Read: ${timeToRead} minutes`);
+  parts.push(`Labels: ${labels.join(', ')}`);
+  if (externalUrl) parts.push(`External URL: ${externalUrl}`);
+  parts.push('=== End ===');
+
+  return parts.join('\n');
+}
+
+/**
+ * Performs fixed character-based chunking on given text
+ *
+ * @param text The text to chunk
+ * @param chunkSize The number of characters each chunk is maximum
+ * @param overlap The amount of overlap between chunks
+ * @returns List of chunk objects containing the index of chunk and content
+ */
+export function chunkText(text: string, chunkSize = 700, overlap = 100): Chunk[] {
+  if (typeof text !== 'string' || !text) return [];
+  const size = Math.max(1, Math.floor(chunkSize));
+  const ov = Math.max(0, Math.floor(overlap));
+
+  const chunks: Chunk[] = [];
+  const step = size - ov > 0 ? size - ov : 1;
+
+  for (let start = 0; start < text.length; start += step) {
+    chunks.push({ index: chunks.length, content: text.slice(start, start + size) });
+    // This chunk already reaches the end of the text; a further chunk would
+    // only repeat part of its tail.
+    if (start + size >= text.length) break;
+  }
+
+  return chunks;
+}
+
+/**
+ * Chunks a given resource object. Resource objects are based on resources
+ * as described in the prisma schema.
+ *
+ * @param resource The resource (described in prisma schema) to chunk
+ * @param chunkSize Size of chunk
+ * @param overlap Size of overlap between chunks
+ * @returns a list of ChunkedResource objects containing id of resource, chunk index, and content.
+ */
+export function chunkResource(resource: Resource, chunkSize = 700, overlap = 100): ChunkedResource[] {
+  const embeddingText = buildResourceEmbeddingText(resource);
+  return chunkText(embeddingText, chunkSize, overlap).map((chunk) => ({
+    resourceId: resource.id,
+    chunkIndex: chunk.index,
+    content: chunk.content,
+  }));
+}

From 562e51f522e29f6b589975a582c0f64fc5e21540 Mon Sep 17 00:00:00 2001
From: JPark1023
Date: Thu, 5 Mar 2026 21:43:17 -0600
Subject: [PATCH 3/3] resourceChunker unit tests

---
 backend/tests/resourceChunker.tests.ts | 85 ++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 backend/tests/resourceChunker.tests.ts

diff --git a/backend/tests/resourceChunker.tests.ts b/backend/tests/resourceChunker.tests.ts
new file mode 100644
index 0000000..13ac3b7
--- /dev/null
+++ b/backend/tests/resourceChunker.tests.ts
@@ -0,0 +1,85 @@
+import { buildResourceEmbeddingText, chunkText, chunkResource } from '../services/rag/resourceChunker';
+import type { Resource } from '../services/rag/resourceChunker';
+
+describe('resourceChunker', () => {
+  test('buildResourceEmbeddingText includes fields and sorts labels & ageGroups', () => {
+    const resource: Resource = {
+      id: 'r1',
+      title: ' My Title ',
+      description: ' A description. ',
+      category: 'CHILD_DEVELOPMENT',
+      ageGroups: ['AGE_14_18', 'AGE_4_6'],
+      language: 'ENGLISH',
+      resourceType: 'WEBPAGE',
+      timeToRead: 5,
+      labels: [
+        { id: 'l1', resource_id: 'r1', label_id: 'la', label: { id: 'la', label_name: 'Zoo', category: 'CHILD_DEVELOPMENT' } },
+        { id: 'l2', resource_id: 'r1', label_id: 'lb', label: { id: 'lb', label_name: 'Alpha', category: 'CHILD_DEVELOPMENT' } },
+      ],
+      image_s3_key: null,
+      externalResources: null,
+    };
+
+    const txt = buildResourceEmbeddingText(resource);
+    expect(txt).toContain('Title: My Title');
+    expect(txt).toContain('Description: A description.');
+    expect(txt).toMatch(/labels?:\s*Alpha,\s*Zoo/i);
+    expect(txt).toMatch(/Age Groups:\s*AGE_4_6,\s*AGE_14_18/);
+  });
+
+  test('chunkText returns single chunk for short text', () => {
+    const short = 'short text';
+    const chunks = chunkText(short, 100, 20);
+    expect(chunks).toHaveLength(1);
+    expect(chunks[0].index).toBe(0);
+    expect(chunks[0].content).toBe(short);
+  });
+
+  test('chunkText does not emit a trailing chunk that is already covered', () => {
+    const chunks = chunkText('x'.repeat(90), 100, 20);
+    expect(chunks).toHaveLength(1);
+    expect(chunks[0].content).toHaveLength(90);
+  });
+
+  test('chunkText produces overlapping chunks and advances deterministically', () => {
+    // 'word ' is 5 characters, so this builds a 2500-character text
+    const longText = 'word '.repeat(500);
+    const chunkSize = 100;
+    const overlap = 20;
+    const chunks = chunkText(longText, chunkSize, overlap);
+    expect(chunks.length).toBeGreaterThan(1);
+
+    // indices are sequential starting at 0
+    chunks.forEach((c, i) => expect(c.index).toBe(i));
+
+    // verify overlap between adjacent chunks
+    for (let i = 0; i < chunks.length - 1; i++) {
+      const aTail = chunks[i].content.slice(-overlap);
+      const bHead = chunks[i + 1].content.slice(0, overlap);
+      expect(aTail).toBe(bHead);
+    }
+  });
+
+  test('chunkResource returns resourceId and deterministic chunks', () => {
+    const resource: Resource = {
+      id: 'res-123',
+      title: 'Title',
+      description: 'Desc',
+      category: 'CHILD_DEVELOPMENT',
+      ageGroups: [],
+      language: 'ENGLISH',
+      resourceType: 'PDF',
+      timeToRead: 1,
+      labels: [],
+      image_s3_key: null,
+      externalResources: null,
+    };
+
+    const a = chunkResource(resource, 50, 10);
+    const b = chunkResource(resource, 50, 10);
+    expect(a).toEqual(b); // deterministic
+    expect(a.every((c) => c.resourceId === resource.id)).toBe(true);
+    // chunkIndex sequential 0..N-1
+    a.forEach((c, i) => expect(c.chunkIndex).toBe(i));
+  });
+});