Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 33 additions & 17 deletions backend/prisma/schema.prisma
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
// Prisma Client generator. Enables the "postgresqlExtensions" preview feature,
// which the datasource's `extensions` setting relies on.
// NOTE(review): `provider` appears twice below — presumably the removed and added
// sides of a whitespace-only diff line; confirm only one survives in the real schema.
generator client {
provider = "prisma-client-js"
provider = "prisma-client-js"
previewFeatures = ["postgresqlExtensions"]
}

// PostgreSQL datasource. Connection strings are read from the DATABASE_URL and
// DIRECT_URL environment variables.
// NOTE(review): provider/url/directUrl each appear twice — presumably the removed and
// added sides of re-aligned diff lines; confirm only one of each survives.
datasource db {
provider = "postgresql"
url = env("DATABASE_URL")
directUrl = env("DIRECT_URL")
provider = "postgresql"
url = env("DATABASE_URL")
directUrl = env("DIRECT_URL")
// Declares the pgvector extension (database name "vector") in the "extensions" schema.
extensions = [pgvector(map: "vector", schema: "extensions")]
}

enum RESOURCE_TYPE {
Expand Down Expand Up @@ -75,7 +77,8 @@ model Resource {
externalResources ExternalResources?
collectionItems CollectionItem[]
internalResource InternalHostedResources?
image_s3_key String?
image_s3_key String?
ResourceEmbedding ResourceEmbedding?

@@map("resource")
}
Expand All @@ -84,7 +87,7 @@ model ResourceLabel {
id String @id @default(cuid())
resource_id String
label_id String
resource Resource @relation(fields: [resource_id], references: [id], onDelete:Cascade)
resource Resource @relation(fields: [resource_id], references: [id], onDelete: Cascade)
label CategoryLabel @relation(fields: [label_id], references: [id])

@@unique([resource_id, label_id])
Expand All @@ -100,6 +103,20 @@ model ExternalResources {
@@map("externalResources")
}

// Stored embedding for a Resource; persisted in the "resourceEmbedding" table.
// NOTE(review): `resource_id` is @unique and Resource declares `ResourceEmbedding?`,
// so at most one row can exist per resource, yet each row carries a `chunk_index`.
// If several chunks per resource are meant to be stored, this probably needs
// `@@unique([resource_id, chunk_index])` and a list relation on Resource — confirm.
model ResourceEmbedding {
id String @id @default(cuid())
// Foreign key to Resource.id (see the `resource` relation below).
resource_id String @unique
// 1536-wide vector column, declared through Unsupported(...).
embedding Unsupported("vector(1536)")
// Vector width; the default matches the vector(1536) column above.
dims Int @default(1536)
// Text of the chunk and its position; presumably the chunk the embedding was
// computed from — TODO confirm against the writer of this table.
chunk_text String
chunk_index Int
created_at DateTime @default(now())
updated_at DateTime @updatedAt
// Deleting the parent Resource cascades to this row.
resource Resource @relation(fields: [resource_id], references: [id], onDelete: Cascade)

@@map("resourceEmbedding")
}

enum RELATIONSHIP_TYPE {
MOTHER
FATHER
Expand All @@ -119,24 +136,24 @@ enum HOUSEHOLD_TYPE {
}

// Parent account record, persisted in the "parent" table. `clerk_id` and `email`
// are unique; a parent owns zero or more Collections.
// NOTE(review): several fields appear twice below — presumably the removed and added
// sides of re-aligned diff lines; confirm each field is declared once in the real schema.
model Parent {
id String @id @default(cuid())
clerk_id String @unique
id String @id @default(cuid())
clerk_id String @unique
first_name String?
last_name String?
email String @unique
email String @unique

// Optional profile details and preferences.
relationship RELATIONSHIP_TYPE?
household_type HOUSEHOLD_TYPE?
topics_of_interest CATEGORY_TYPE[]
kids_age_groups AGE_GROUP[]
subscribed_newsletter Boolean @default(false)

created_at DateTime @default(now())
updated_at DateTime @updatedAt
onboarding_complete Boolean @default(false)
created_at DateTime @default(now())
updated_at DateTime @updatedAt
onboarding_complete Boolean @default(false)

collections Collection[]

collections Collection[]
@@map("parent")
}

Expand All @@ -147,8 +164,8 @@ model Collection {
created_at DateTime @default(now())
updated_at DateTime @updatedAt

parent Parent @relation(fields: [parent_fk], references: [id], onDelete: Cascade)
items CollectionItem[]
parent Parent @relation(fields: [parent_fk], references: [id], onDelete: Cascade)
items CollectionItem[]

@@unique([parent_fk, name])
@@map("collection")
Expand Down Expand Up @@ -176,7 +193,6 @@ model InternalHostedResources {
@@map("internalHostedResources")
}


model AdminUser {
id String @id @default(cuid())
clerk_id String @unique
Expand All @@ -203,4 +219,4 @@ model AdminLog {

@@index([admin_fk])
@@map("adminlog")
}
}
175 changes: 175 additions & 0 deletions backend/services/rag/resourceChunker.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
// Local type declarations for the chunker. The function docs in this module describe
// Resource "as described in prisma schema", so these presumably mirror the Prisma
// enums/models — verify they stay in sync with the schema.

/** Top-level category a resource or label belongs to. */
type CategoryType =
| 'PARENTING_SKILLS_RELATIONSHIPS'
| 'CHILD_DEVELOPMENT'
| 'MENTAL_EMOTIONAL_HEALTH'
| 'SAFETY_PROTECTION'
| 'EDUCATION_LEARNING'
| 'HEALTH_WELLBEING'
| 'LIFE_SKILLS_INDEPENDENCE'
| 'FAMILY_SUPPORT_COMMUNITY';

/** Age bracket a resource targets. */
type AgeGroup =
| 'AGE_0_3'
| 'AGE_4_6'
| 'AGE_7_10'
| 'AGE_11_13'
| 'AGE_14_18'
| 'AGE_18_ABOVE';

/** Language of a resource. */
type Language = 'ENGLISH' | 'SPANISH' | 'OTHER';

/** Media/format kind of a resource. */
type ResourceType =
| 'PDF'
| 'TXT'
| 'VIDEO'
| 'WEBINAR'
| 'WEBPAGE'
| 'INTERACTIVE_QUIZ'
| 'OTHER';

/** A named label within a category. */
type CategoryLabel = {
id: string;
label_name: string;
category: CategoryType;
};

/** Join record linking a resource to a label, with the label itself included. */
type ResourceLabel = {
id: string;
resource_id: string;
label_id: string;
label: CategoryLabel;
};

/** External link attached to a resource. */
type ExternalResources = {
external_url: string;
};

/** The resource shape consumed by buildResourceEmbeddingText and chunkResource. */
type Resource = {
id: string;
title: string;
description?: string | null;
category: CategoryType;
ageGroups: AgeGroup[];
language: Language;
resourceType: ResourceType;
// Rendered as "<n> minutes" in the embedding text.
timeToRead: number;
labels: ResourceLabel[];
image_s3_key?: string | null;
externalResources?: ExternalResources | null;
};

/** One slice of text produced by chunkText, with its zero-based position. */
type Chunk = {
index: number;
content: string;
}

/** A Chunk tagged with the id of the resource it was produced from. */
type ChunkedResource = {
resourceId: string;
chunkIndex: number;
content: string;
}

// Canonical ordering of age groups, listed from AGE_0_3 up to AGE_18_ABOVE.
// Used to build the compareAgeGroups comparator, so age groups are not sorted
// alphabetically (which would place AGE_11_13 before AGE_4_6).
const AGE_GROUP_ORDER: AgeGroup[] = [
'AGE_0_3',
'AGE_4_6',
'AGE_7_10',
'AGE_11_13',
'AGE_14_18',
'AGE_18_ABOVE',
];

/**
 * Creates a sort comparator that ranks values by their position in `order`.
 *
 * Values that do not appear in `order` rank after every listed value; whenever two
 * values share a rank (including two unlisted values) `localeCompare` breaks the tie.
 *
 * @param order Values in the desired sort order
 * @returns A comparator suitable for `Array.prototype.sort`
 */
function orderComparator<T extends string>(order: T[]) {
  const rank = new Map<T, number>(
    order.map((value, position): [T, number] => [value, position]),
  );
  const rankOf = (value: T): number => rank.get(value) ?? Number.POSITIVE_INFINITY;

  return (a: T, b: T) => {
    const rankA = rankOf(a);
    const rankB = rankOf(b);
    // Compare ranks for equality first: subtracting two infinite ranks would yield NaN.
    return rankA === rankB ? a.localeCompare(b) : rankA - rankB;
  };
}

// Comparator that sorts AgeGroup values in AGE_GROUP_ORDER order.
const compareAgeGroups = orderComparator(AGE_GROUP_ORDER);

/**
 * Builds the plain-text document that is embedded for a resource.
 *
 * The text is a `=== Resource ===` / `=== End ===` delimited list of `Field: value`
 * lines joined with `\n`. Age groups are sorted in canonical age order and labels
 * alphabetically so the same resource always yields the same text. Optional or
 * empty fields (description, age groups, labels, external URL) are left out
 * rather than emitted as blank lines.
 *
 * @param resource a Resource object as described in prisma schema
 * @returns The embedding text
 */
export function buildResourceEmbeddingText(resource: Resource): string {
  // Tolerate a missing title at runtime (e.g. an untyped row) instead of throwing on `.trim()`.
  const title = (resource.title ?? '').trim();
  const description = (resource.description ?? '').trim();

  // Copy before sorting so the caller's array is never mutated.
  const ageGroups = Array.isArray(resource.ageGroups)
    ? [...resource.ageGroups].sort(compareAgeGroups)
    : [];

  const labels = Array.isArray(resource.labels)
    ? resource.labels
        .map((resourceLabel) => (resourceLabel?.label?.label_name ?? '').trim())
        .filter(Boolean)
        .sort()
    : [];

  const externalUrl = resource.externalResources?.external_url ?? '';

  const parts: string[] = ['=== Resource ===', `Title: ${title}`];
  if (description) parts.push(`Description: ${description}`);
  parts.push(`Category: ${resource.category}`);
  if (ageGroups.length > 0) parts.push(`Age Groups: ${ageGroups.join(', ')}`);
  parts.push(`Language: ${resource.language}`);
  parts.push(`Resource Type: ${resource.resourceType}`);
  // No trailing space: stray whitespace would end up inside the embedded text.
  parts.push(`Time to Read: ${resource.timeToRead} minutes`);
  if (labels.length > 0) parts.push(`Labels: ${labels.join(', ')}`);
  if (externalUrl) parts.push(`External URL: ${externalUrl}`);
  parts.push('=== End ===');

  return parts.join('\n');
}

/**
 * Performs fixed character-based chunking on the given text.
 *
 * Consecutive chunks start `chunkSize - overlap` characters apart, so each chunk
 * repeats the last `overlap` characters of the previous one. Chunking stops as soon
 * as a chunk reaches the end of the text, so no trailing chunk is emitted that is
 * already fully contained in the previous chunk.
 *
 * Sizes are measured in UTF-16 code units (the unit `String.prototype.slice` uses),
 * so a surrogate pair may be split across two chunks.
 *
 * @param text The text to chunk
 * @param chunkSize Maximum number of characters per chunk; floored and clamped to at least 1
 * @param overlap Number of characters shared by adjacent chunks; floored and clamped to at least 0.
 *   When it is not smaller than `chunkSize` the window still advances one character at a time.
 * @returns List of chunk objects containing the index of the chunk and its content;
 *   empty when `text` is empty or not a string
 */
export function chunkText(text: string, chunkSize = 700, overlap = 100): Chunk[] {
  if (typeof text !== 'string' || !text) return [];

  // `|| fallback` turns NaN into a usable value; negatives are handled by Math.max.
  const size = Math.max(1, Math.floor(chunkSize) || 1);
  const shared = Math.max(0, Math.floor(overlap) || 0);
  // Always advance by at least one character so the loop terminates.
  const step = Math.max(1, size - shared);

  const chunks: Chunk[] = [];
  for (let start = 0; start < text.length; start += step) {
    chunks.push({ index: chunks.length, content: text.slice(start, start + size) });
    // This chunk already covers the tail of the text; a further chunk would only
    // repeat characters that are contained in it.
    if (start + size >= text.length) break;
  }

  return chunks;
}

/**
 * Turns a resource into embedding-ready chunks: the resource is first rendered to
 * its embedding text, that text is split into fixed-size overlapping chunks, and
 * every chunk is tagged with the id of the resource it came from.
 *
 * @param resource The resource (described in prisma schema) to chunk
 * @param chunkSize Size of chunk
 * @param overlap Size of overlap between chunks
 * @returns a list of ChunkedResource objects containing id of resource, chunk index, and content.
 */
export function chunkResource(resource: Resource, chunkSize=700, overlap=100) : ChunkedResource[] {
  const pieces: Chunk[] = chunkText(buildResourceEmbeddingText(resource), chunkSize, overlap);
  const resourceId = resource.id;

  return pieces.map(
    (piece): ChunkedResource => ({
      resourceId,
      chunkIndex: piece.index,
      content: piece.content,
    }),
  );
}
90 changes: 90 additions & 0 deletions backend/tests/resourceChunker.tests.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import { buildResourceEmbeddingText, chunkText, chunkResource } from '../services/rag/resourceChunker';

// The module does not export its `Resource` type, so the fixture type is derived
// from the function under test. This keeps fixtures type-checked without `as any`.
type ResourceInput = Parameters<typeof buildResourceEmbeddingText>[0];

describe('resourceChunker', () => {
  test('buildResourceEmbeddingText includes fields and sorts labels & ageGroups', () => {
    const resource: ResourceInput = {
      id: 'r1',
      title: ' My Title ',
      description: ' A description. ',
      category: 'CHILD_DEVELOPMENT',
      // Deliberately out of order: the builder must sort by age, not alphabetically.
      ageGroups: ['AGE_14_18', 'AGE_4_6'],
      language: 'ENGLISH',
      resourceType: 'WEBPAGE',
      timeToRead: 5,
      labels: [
        { id: 'l1', resource_id: 'r1', label_id: 'la', label: { id: 'la', label_name: 'Zoo', category: 'CHILD_DEVELOPMENT' } },
        { id: 'l2', resource_id: 'r1', label_id: 'lb', label: { id: 'lb', label_name: 'Alpha', category: 'CHILD_DEVELOPMENT' } },
      ],
      image_s3_key: null,
      externalResources: null,
    };

    const txt = buildResourceEmbeddingText(resource);
    expect(txt).toContain('Title: My Title');
    expect(txt).toContain('Description: A description.');
    expect(txt).toMatch(/labels?:\s*Alpha,\s*Zoo/i);
    expect(txt).toMatch(/Age Groups:\s*AGE_4_6,\s*AGE_14_18/);
  });

  test('chunkText returns single chunk for short text', () => {
    const short = 'short text';
    const chunks = chunkText(short, 100, 20);
    expect(chunks).toHaveLength(1);
    expect(chunks[0].index).toBe(0);
    expect(chunks[0].content).toBe(short);
  });

  test('chunkText produces overlapping chunks and advances deterministically', () => {
    // 500 repetitions of a 5-character word => 2500 characters.
    const longText = 'word '.repeat(500);
    const chunkSize = 100;
    const overlap = 20;
    const chunks = chunkText(longText, chunkSize, overlap);
    expect(chunks.length).toBeGreaterThan(1);

    // indices are sequential starting at 0
    chunks.forEach((chunk, i) => {
      expect(chunk.index).toBe(i);
    });

    // verify overlap between adjacent chunks
    for (let i = 0; i < chunks.length - 1; i++) {
      const aTail = chunks[i].content.slice(-overlap);
      const bHead = chunks[i + 1].content.slice(0, overlap);
      expect(aTail).toBe(bHead);
    }
  });

  test('chunkResource returns resourceId and deterministic chunks', () => {
    const resource: ResourceInput = {
      id: 'res-123',
      title: 'Title',
      description: 'Desc',
      category: 'CHILD_DEVELOPMENT',
      ageGroups: [],
      language: 'ENGLISH',
      resourceType: 'PDF',
      timeToRead: 1,
      labels: [],
      image_s3_key: null,
      externalResources: null,
    };

    const a = chunkResource(resource, 50, 10);
    const b = chunkResource(resource, 50, 10);
    expect(a).toEqual(b); // deterministic
    // Guard against the assertions below passing vacuously on an empty result.
    expect(a.length).toBeGreaterThan(1);
    expect(a.every((c) => c.resourceId === resource.id)).toBe(true);
    // chunkIndex sequential 0..N-1
    a.forEach((chunk, i) => {
      expect(chunk.chunkIndex).toBe(i);
    });
  });
});
Loading