Skip to content

[FEATURE] Ergonomic Rust SDK #1667

@tomz-alt

Description

@tomz-alt

What is the use case?

An idiomatic Rust SDK for CocoIndex with the same incremental pipeline capabilities as the Python SDK — memoized computation, scoped components, file walking with fingerprinting — but using proc macros instead of decorators and explicit &Ctx instead of hidden globals.

Describe the solution you'd like

App

Entry point. Open an LMDB-backed app and run a pipeline:

let app = cocoindex::App::open("my_app", ".cocoindex_db")?;

let stats = app.run(|ctx| async move {
    // pipeline logic using ctx
    Ok(())
}).await?;

println!("{stats}");  // "processed 5, wrote 3, skipped 2 in 0.4s"
Method Signatures
fn open(name: &str, db_path: &str) -> Result<App>
async fn run<T, F, Fut>(&self, f: F) -> Result<RunStats>
where F: FnOnce(Ctx) -> Fut, Fut: Future<Output = Result<T>>

Ctx

Pipeline context, threaded explicitly through every function:

impl Ctx {
    /// Create a named sub-component. Scopes track what changed between runs.
    pub async fn scope<T, F, Fut>(&self, key: &impl Display, f: F) -> Result<T>
    where F: FnOnce(Ctx) -> Fut, Fut: Future<Output = Result<T>>;

    /// Memoized computation. Skips the closure if the key hasn't changed since last run.
    pub async fn memo<T, F, Fut>(&self, key: &impl Serialize, f: F) -> Result<T>
    where F: FnOnce() -> Fut, Fut: Future<Output = Result<T>>;

    /// Write a file output. CocoIndex tracks it for incremental updates.
    pub fn write_file(&self, path: impl AsRef<Path>, content: &[u8]) -> Result<()>;
}

FileEntry

Returned by fs::walk(). Lazy content, eager fingerprint — the fingerprint is used as a memo key so unchanged files skip processing entirely:

impl FileEntry {
    pub fn path(&self) -> &Path;
    pub fn relative_path(&self) -> &Path;
    pub fn stem(&self) -> &str;
    pub fn fingerprint(&self) -> impl Serialize;
    pub fn content(&self) -> Result<Vec<u8>>;
    pub fn content_str(&self) -> Result<String>;
}
let files = cocoindex::fs::walk(&dir, &["**/*.rs", "**/*.py"])?;

RunStats

pub struct RunStats {
    pub processed: u64,
    pub skipped: u64,
    pub written: u64,
    pub deleted: u64,
    pub elapsed: Duration,
}

#[cocoindex::cached]

Memoize a function by its arguments. The first &Ctx parameter is recognized automatically; remaining parameters become the cache key:

#[cocoindex::cached]
async fn analyze(ctx: &Ctx, file: &FileEntry, model: &Model) -> Result<Info> {
    // expensive work — only runs if file or model changed
}

Expands to:

async fn analyze(ctx: &Ctx, file: &FileEntry, model: &Model) -> Result<Info> {
    ctx.memo(&(file, model), {
        let file = file.clone();
        let model = model.clone();
        move || async move { /* original body */ }
    }).await
}

#[cocoindex::component]

Create a named pipeline scope. The first String/Display parameter after &Ctx becomes the scope key:

#[cocoindex::component]
async fn process_project(ctx: &Ctx, name: String, dir: PathBuf) -> Result<()> {
    // this runs inside scope keyed by `name`
}

Expands to:

async fn process_project(ctx: &Ctx, name: String, dir: PathBuf) -> Result<()> {
    ctx.scope(&name, {
        let name = name.clone();
        let dir = dir.clone();
        move |ctx| async move { /* original body */ }
    }).await
}

Additional context

Example: Multi-Codebase Summarization

Rust equivalent of the Python multi-codebase summarization example. Walks a directory of projects, extracts per-file info via LLM (memoized), aggregates per project, writes markdown.

use cocoindex::prelude::*;
use rig::providers::openai;
use schemars::JsonSchema;
use std::path::PathBuf;

#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
struct CodebaseInfo {
    name: String,
    summary: String,
    public_types: Vec<String>,
    public_functions: Vec<String>,
}

type Extractor = rig::extractor::Extractor<openai::CompletionModel, CodebaseInfo>;

fn build_extractor() -> Extractor {
    openai::Client::from_env()
        .extractor::<CodebaseInfo>(openai::GPT_4O_MINI)
        .preamble("You are a code analyst. Extract structured information about the file's public API.")
        .build()
}

#[cocoindex::cached]
async fn extract_file(ctx: &Ctx, file: &FileEntry, ext: &Extractor) -> Result<CodebaseInfo> {
    let content = file.content_str()?;
    let path = file.relative_path().display().to_string();
    ext.extract(&format!("Analyze {path}:\n```\n{content}\n```"))
        .await.map_err(|e| cocoindex::Error::engine(format!("{e}")))
}

#[cocoindex::cached]
async fn aggregate(ctx: &Ctx, name: &str, infos: &[CodebaseInfo], ext: &Extractor) -> Result<CodebaseInfo> {
    let text: String = infos.iter().map(|i| format!("- {}: {}", i.name, i.summary)).collect::<Vec<_>>().join("\n");
    ext.extract(&format!("Summarize project {name}:\n{text}"))
        .await.map_err(|e| cocoindex::Error::engine(format!("{e}")))
}

#[cocoindex::component]
async fn process_project(ctx: &Ctx, name: String, dir: PathBuf, output: PathBuf, ext: Extractor) -> Result<()> {
    let files = cocoindex::fs::walk(&dir, &["**/*.rs", "**/*.py", "**/*.ts"])?;
    let mut infos = Vec::new();
    for file in &files {
        infos.push(extract_file(&ctx, file, &ext).await?);
    }
    let summary = aggregate(&ctx, &name, &infos, &ext).await?;
    let md = format!("# {name}\n\n{}\n", summary.summary);
    ctx.write_file(output.join(format!("{name}.md")), md.as_bytes())?;
    Ok(())
}

#[tokio::main]
async fn main() -> cocoindex::Result<()> {
    let app = cocoindex::App::open("multi_codebase", ".cocoindex_db")?;
    let ext = build_extractor();

    let stats = app.run(|ctx| {
        let ext = ext.clone();
        async move {
            for entry in std::fs::read_dir("./projects")? {
                let entry = entry?;
                if !entry.file_type()?.is_dir() { continue; }
                let name = entry.file_name().to_string_lossy().to_string();
                if name.starts_with('.') { continue; }
                process_project(&ctx, name, entry.path(), "./output".into(), ext.clone()).await?;
            }
            Ok(())
        }
    }).await?;

    println!("{stats}");
    Ok(())
}
cargo run
# processed 47, wrote 4, skipped 0 in 12.3s

❤️ Contributors, please refer to 📙Contributing Guide.
Unless the PR can be sent immediately (e.g. it's just a few lines of code), we recommend leaving a comment on the issue — such as "I'm working on it" or "Can I work on this issue?" — to avoid duplicated work. Our Discord server is always open and friendly.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions