Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 4 additions & 15 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ task examples:all

Requires Task, Docker, and LLM credentials — see [examples/README.md](examples/README.md).

If you change **agent behavior** (e.g. `pkg/agent`, telemetry, tools, runtime) or **`eval-harness/`**, run:
If you change **agent behavior** (e.g. `pkg/agent`, `pkg/memory`, telemetry, tools, runtime) or **`eval-harness/`**, run:

```bash
make eval-harness
Expand Down Expand Up @@ -136,7 +136,7 @@ Or run a single example:
go run ./examples/simple_agent "Hello"
```

See [examples/README.md](examples/README.md) for all examples, env vars, Task install, and infra commands (`task infra:*`, `task examples:local`).
See [examples/README.md](examples/README.md) for all examples, env vars, Task install, and infra commands (`task infra:*`, `task examples:local`). Memory examples (`examples/agent_with_memory/`) need Weaviate or pgvector — see [examples/agent_with_memory/README.md](examples/agent_with_memory/README.md).

## Ways to Contribute

Expand Down Expand Up @@ -180,7 +180,7 @@ Using the SDK and ran into issues, unclear docs, or confusing behavior? **Raise
2. **Tests**
- Add tests for new features and bug fixes.
- Unit tests go in `*_test.go` files alongside the code.
- Agent behavior changes (`pkg/agent`, telemetry, tools, runtime) or **`eval-harness/`** edits — run `make eval-harness` before submitting a PR.
- Agent behavior changes (`pkg/agent`, `pkg/memory`, telemetry, tools, runtime) or **`eval-harness/`** edits — run `make eval-harness` before submitting a PR.

3. **Commits**
- Use [conventional commits](https://www.conventionalcommits.org) — these drive the release changelog:
Expand All @@ -201,18 +201,7 @@ Using the SDK and ran into issues, unclear docs, or confusing behavior? **Raise
- Keep changes focused. For larger work, consider splitting into multiple PRs.
- For new LLM providers: implement `interfaces.LLMClient` (see `pkg/interfaces/llm.go` and existing providers in `pkg/llm/`).
- For new tools: implement `interfaces.Tool` (see `pkg/interfaces/tools.go` and `pkg/tools/`).

## Project Layout

| Path | Purpose |
|------|---------|
| `pkg/agent/` | Agent core, workflow, config |
| `pkg/llm/` | LLM providers (OpenAI, Anthropic, Gemini) |
| `pkg/interfaces/` | Interfaces for LLM clients, tools, messages |
| `pkg/tools/` | Built-in and custom tools |
| `pkg/conversation/` | Message history (in-memory, Redis) |
| `cmd/` | agentctl CLI |
| `examples/` | Example programs |
- For new memory backends: implement `interfaces.Memory` (see `pkg/interfaces/memory.go` and `pkg/memory/weaviate` or `pkg/memory/pgvector`).

## Releasing (maintainers only)

Expand Down
468 changes: 310 additions & 158 deletions README.md

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Mock components apply configurable latency and jitter so results reflect realist
- Process CPU time
- Total input/output tokens (from mock LLM stats; includes sub-agent LLM calls)
- Success rate (`Run()` completed without error)
- Long-term memory recalls/stores (when `memory.enabled: true`; from run telemetry)
- `est_cost_usd` — placeholder `0` until pricing is configured

Reports are written to `benchmarks/reports/` (JSON or text). SDK logs (optional) go to `benchmarks/logs/`.
Expand Down Expand Up @@ -205,6 +206,18 @@ All paths in config (`dir` fields) are relative to the **repository root** unles
| `subagents.count` | Sub-agents per level (0 to disable). |
| `subagents.levels` | Max sub-agent nesting depth (1–5). |

### `memory`

Long-term memory (`agent.WithMemory`) backed by an in-process, in-memory store (no Docker required). Disabled by default.

| Field | Description |
| :--- | :--- |
| `enabled` | `true` enables memory recall before each run and memory storage during or after it, depending on `store_mode`. |
| `store_mode` | `ondemand` (LLM `save_memory` tool) or `always` (extract at run end). |
| `user_id` | Scope user ID passed via `memory.WithContextUserID` (default `benchmark-user`). |

When `memory.enabled: true`, `agent.tools.count` may be `0` (memory-only runs). The mock LLM handles `save_memory` tool arguments and memory-extraction JSON the same way the eval harness does.

### `logger`

| Field | Description |
Expand Down
5 changes: 4 additions & 1 deletion benchmarks/agent_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ type AgentBundle struct {

func buildAgentBundle(cfg *setup.Config, llm *setup.MockLLMClient, lgr logger.Logger, tree *setup.AgentTree) (*AgentBundle, error) {
enableRemote := cfg.ExternalWorkersEnabled()
opts := setup.RootOptions(cfg, llm, lgr, setup.RootAgentName, tree.RootPrompt, tree.SubAgents, cfg.Temporal.TaskQueue, enableRemote)
opts, err := setup.AppendMemoryOptions(cfg, setup.RootOptions(cfg, llm, lgr, setup.RootAgentName, tree.RootPrompt, tree.SubAgents, cfg.Temporal.TaskQueue, enableRemote))
if err != nil {
return nil, err
}

root, err := agent.NewAgent(opts...)
if err != nil {
Expand Down
5 changes: 5 additions & 0 deletions benchmarks/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ agent:
count: 2
levels: 1 # 1 or 2

memory:
enabled: false
store_mode: ondemand # ondemand or always
user_id: benchmark-user

logger:
enabled: false # true writes SDK logs under benchmarks/logs
dir: benchmarks/logs
Expand Down
6 changes: 6 additions & 0 deletions benchmarks/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ type BenchmarkMetrics struct {

TotalRuns int `json:"total_runs"`
SuccessRate float64 `json:"success_rate"`

TotalMemoryRecalls int64 `json:"total_memory_recalls"`
TotalMemoryStores int64 `json:"total_memory_stores"`
}

func main() {
Expand Down Expand Up @@ -67,6 +70,9 @@ func main() {
fmt.Printf("Starting agent-sdk-go benchmark (%s runtime)\n", cfg.Runtime)
fmt.Printf("Runs: %d Concurrent: %t Tools: %d Sub-agents: %d (levels %d)\n",
cfg.Agent.Runs, cfg.Agent.Concurrent, cfg.Agent.Tools.Count, cfg.Agent.Subagents.Count, cfg.Agent.Subagents.Levels)
if cfg.MemoryEnabled() {
fmt.Printf("Memory : enabled (store_mode=%s, user_id=%s)\n", cfg.Memory.StoreMode, cfg.Memory.UserID)
}
if cfg.UseTemporal() {
fmt.Printf("External workers : %d\n", cfg.Temporal.WorkersCount)
}
Expand Down
29 changes: 17 additions & 12 deletions benchmarks/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,14 @@ import (
func aggregateMetrics(outcomes []runOutcome, memBefore, memAfter runtime.MemStats, cpuMs float64, inputTokens, outputTokens int) *BenchmarkMetrics {
latencies := make([]float64, 0, len(outcomes))
successes := 0
var totalRecalls, totalStores int64
for _, o := range outcomes {
latencies = append(latencies, o.latencyMs)
if o.success {
successes++
}
totalRecalls += o.memoryRecalls
totalStores += o.memoryStores
}
sort.Float64s(latencies)

Expand All @@ -24,18 +27,20 @@ func aggregateMetrics(outcomes []runOutcome, memBefore, memAfter runtime.MemStat
}

return &BenchmarkMetrics{
P50Ms: percentile(latencies, 50),
P95Ms: percentile(latencies, 95),
P99Ms: percentile(latencies, 99),
AvgMs: average(latencies),
HeapAllocBytes: deltaUint64(memAfter.Alloc, memBefore.Alloc),
TotalAllocBytes: deltaUint64(memAfter.TotalAlloc, memBefore.TotalAlloc),
CPUTimeMs: cpuMs,
TotalInputTokens: inputTokens,
TotalOutputTokens: outputTokens,
EstCostUSD: 0, // pricing to be defined later
TotalRuns: totalRuns,
SuccessRate: successRate,
P50Ms: percentile(latencies, 50),
P95Ms: percentile(latencies, 95),
P99Ms: percentile(latencies, 99),
AvgMs: average(latencies),
HeapAllocBytes: deltaUint64(memAfter.Alloc, memBefore.Alloc),
TotalAllocBytes: deltaUint64(memAfter.TotalAlloc, memBefore.TotalAlloc),
CPUTimeMs: cpuMs,
TotalInputTokens: inputTokens,
TotalOutputTokens: outputTokens,
EstCostUSD: 0, // pricing to be defined later
TotalRuns: totalRuns,
SuccessRate: successRate,
TotalMemoryRecalls: totalRecalls,
TotalMemoryStores: totalStores,
}
}

Expand Down
4 changes: 4 additions & 0 deletions benchmarks/report.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,5 +89,9 @@ func formatTextReport(cfg *setup.Config, metrics *BenchmarkMetrics) string {
fmt.Fprintf(&b, "Output tokens : %d\n", metrics.TotalOutputTokens)
fmt.Fprintf(&b, "Est. cost (USD) : %.4f # pricing placeholder\n", metrics.EstCostUSD)
fmt.Fprintf(&b, "Success rate (%%) : %.2f\n", metrics.SuccessRate)
if cfg.MemoryEnabled() {
fmt.Fprintf(&b, "Memory recalls : %d\n", metrics.TotalMemoryRecalls)
fmt.Fprintf(&b, "Memory stores : %d\n", metrics.TotalMemoryStores)
}
return b.String()
}
24 changes: 18 additions & 6 deletions benchmarks/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,14 @@ import (
"github.com/agenticenv/agent-sdk-go/benchmarks/setup"
"github.com/agenticenv/agent-sdk-go/pkg/agent"
"github.com/agenticenv/agent-sdk-go/pkg/logger"
"github.com/agenticenv/agent-sdk-go/pkg/memory"
)

type runOutcome struct {
latencyMs float64
success bool
latencyMs float64
success bool
memoryRecalls int64
memoryStores int64
}

func runBenchmark(ctx context.Context, cfg *setup.Config, llm *setup.MockLLMClient, lgr logger.Logger, runRng *rand.Rand) (*BenchmarkMetrics, error) {
Expand Down Expand Up @@ -64,7 +67,7 @@ func runBenchmark(ctx context.Context, cfg *setup.Config, llm *setup.MockLLMClie
agentIdx := i % len(bundles)
go func(bundle *AgentBundle) {
defer wg.Done()
outcome := executeRun(ctx, bundle.Root, runRng)
outcome := executeRun(ctx, cfg, bundle.Root, runRng)
outcomesMu.Lock()
outcomes = append(outcomes, outcome)
outcomesMu.Unlock()
Expand All @@ -89,13 +92,22 @@ func runBenchmark(ctx context.Context, cfg *setup.Config, llm *setup.MockLLMClie
return aggregateMetrics(outcomes, memBefore, memAfter, cpuAfter-cpuBefore, inputTokens, outputTokens), nil
}

func executeRun(ctx context.Context, a *agent.Agent, rng *rand.Rand) runOutcome {
func executeRun(ctx context.Context, cfg *setup.Config, a *agent.Agent, rng *rand.Rand) runOutcome {
runCtx := ctx
if cfg.MemoryEnabled() {
runCtx = memory.WithContextUserID(ctx, cfg.Memory.UserID)
}
start := time.Now()
_, err := a.Run(ctx, setup.RandomUserPrompt(rng), nil)
return runOutcome{
result, err := a.Run(runCtx, setup.RandomUserPrompt(rng), nil)
outcome := runOutcome{
latencyMs: float64(time.Since(start).Milliseconds()),
success: err == nil,
}
if result != nil && result.Telemetry != nil {
outcome.memoryRecalls = result.Telemetry.Storage.TotalMemoryRecalls
outcome.memoryStores = result.Telemetry.Storage.TotalMemoryStores
}
return outcome
}

func processCPUTimeMs() (float64, error) {
Expand Down
66 changes: 64 additions & 2 deletions benchmarks/setup/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,22 @@ import (
"path/filepath"
"strings"

testutil "github.com/agenticenv/agent-sdk-go/internal/testing"
"github.com/agenticenv/agent-sdk-go/pkg/agent"
"github.com/agenticenv/agent-sdk-go/pkg/memory"
"github.com/spf13/viper"
)

const BenchmarkTreeSeed int64 = 42
// defaultMemoryUserID is the user ID that MemoryConfig.applyDefaults assigns
// when memory.user_id is empty or whitespace-only.
const defaultMemoryUserID = "benchmark-user"

// Config is the root benchmark configuration. Each field carries a
// mapstructure tag naming the top-level configuration key it is decoded from.
type Config struct {
Runtime string `mapstructure:"runtime"`
Temporal TemporalConfig `mapstructure:"temporal"`
LLM LLMConfig `mapstructure:"llm"`
Tool ToolConfig `mapstructure:"tool"`
Agent AgentConfig `mapstructure:"agent"`
// Memory holds the long-term memory settings (see MemoryConfig).
Memory MemoryConfig `mapstructure:"memory"`
Logger LoggerConfig `mapstructure:"logger"`
Output OutputConfig `mapstructure:"output"`
}
Expand Down Expand Up @@ -58,6 +63,13 @@ type SubagentsConfig struct {
Levels int `mapstructure:"levels"`
}

// MemoryConfig configures long-term memory for benchmark runs.
// Blank StoreMode and UserID values are filled in by applyDefaults.
type MemoryConfig struct {
// Enabled turns long-term memory on; it is what Config.MemoryEnabled reports.
Enabled bool `mapstructure:"enabled"`
// StoreMode is the raw store-mode string; it is validated and converted
// to a memory.StoreMode by parseMemoryStoreMode.
StoreMode string `mapstructure:"store_mode"`
// UserID is the user ID attached to the run context via
// memory.WithContextUserID when memory is enabled.
UserID string `mapstructure:"user_id"`
}

type LoggerConfig struct {
Enabled bool `mapstructure:"enabled"`
Dir string `mapstructure:"dir"`
Expand All @@ -79,6 +91,50 @@ func (c *Config) ExternalWorkersEnabled() bool {
return c.UseTemporal() && c.Temporal.WorkersCount > 0
}

// MemoryEnabled reports whether the configuration turns on long-term memory
// for benchmark runs. A nil receiver is treated as memory disabled.
func (c *Config) MemoryEnabled() bool {
	if c == nil {
		return false
	}
	return c.Memory.Enabled
}

// applyDefaults fills blank (empty or whitespace-only) fields in place:
// UserID falls back to defaultMemoryUserID and StoreMode falls back to
// memory.StoreModeOnDemand. Calling it on a nil receiver is a no-op.
func (m *MemoryConfig) applyDefaults() {
	if m == nil {
		return
	}
	isBlank := func(s string) bool {
		return strings.TrimSpace(s) == ""
	}
	if isBlank(m.UserID) {
		m.UserID = defaultMemoryUserID
	}
	if isBlank(m.StoreMode) {
		m.StoreMode = string(memory.StoreModeOnDemand)
	}
}

// parseMemoryStoreMode converts a raw store_mode string into a memory.StoreMode.
//
// Matching ignores surrounding whitespace and letter case. An empty value, the
// on-demand mode name, "on-demand", and "on_demand" all map to
// memory.StoreModeOnDemand; the always mode name maps to memory.StoreModeAlways.
// Any other value yields an error naming the two accepted modes.
func parseMemoryStoreMode(raw string) (memory.StoreMode, error) {
	normalized := strings.ToLower(strings.TrimSpace(raw))

	// On-demand aliases are checked first, mirroring the default mode.
	onDemand := normalized == "" ||
		normalized == string(memory.StoreModeOnDemand) ||
		normalized == "on-demand" ||
		normalized == "on_demand"
	if onDemand {
		return memory.StoreModeOnDemand, nil
	}
	if normalized == string(memory.StoreModeAlways) {
		return memory.StoreModeAlways, nil
	}
	return "", fmt.Errorf("memory.store_mode must be %q or %q", memory.StoreModeOnDemand, memory.StoreModeAlways)
}

// MemoryAgentOption builds the agent.WithMemory option for benchmark runs.
//
// It returns (nil, nil) when cfg is nil or memory is disabled. Otherwise it
// applies defaults to cfg.Memory in place, parses the store mode, and returns
// an option wrapping the default memory config over a fresh in-memory backend
// with recall enabled. An invalid store mode is returned as an error.
func MemoryAgentOption(cfg *Config) (agent.Option, error) {
	// MemoryEnabled is nil-safe, so it also covers the cfg == nil case.
	if !cfg.MemoryEnabled() {
		return nil, nil
	}

	cfg.Memory.applyDefaults()
	storeMode, err := parseMemoryStoreMode(cfg.Memory.StoreMode)
	if err != nil {
		return nil, err
	}

	// The backend is only created once the configuration is known to be valid.
	memCfg := memory.DefaultConfig(testutil.NewInmemMemory())
	memCfg.Store.Mode, memCfg.Recall.Enabled = storeMode, true
	return agent.WithMemory(memCfg), nil
}

func LoadConfig(path string) (*Config, error) {
if path == "" {
path = defaultConfigPath()
Expand Down Expand Up @@ -106,8 +162,8 @@ func (c *Config) validate() error {
if c.Agent.Concurrent && c.Agent.ConcurrentCount <= 0 {
return fmt.Errorf("agent.concurrent_count must be > 0 when concurrent is true")
}
if c.Agent.Tools.Count <= 0 {
return fmt.Errorf("agent.tools.count must be > 0")
if c.Agent.Tools.Count <= 0 && !c.Memory.Enabled {
return fmt.Errorf("agent.tools.count must be > 0 when memory is disabled")
}
if c.Agent.Subagents.Levels < 0 {
return fmt.Errorf("agent.subagents.levels must be >= 0")
Expand Down Expand Up @@ -148,6 +204,12 @@ func (c *Config) validate() error {
if c.Temporal.Namespace == "" {
c.Temporal.Namespace = "default"
}
c.Memory.applyDefaults()
if c.Memory.Enabled {
if _, err := parseMemoryStoreMode(c.Memory.StoreMode); err != nil {
return err
}
}
return nil
}

Expand Down
28 changes: 28 additions & 0 deletions benchmarks/setup/mock_llm.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,14 @@ import (
"time"

"github.com/agenticenv/agent-sdk-go/internal/runtime"
"github.com/agenticenv/agent-sdk-go/internal/types"
"github.com/agenticenv/agent-sdk-go/pkg/interfaces"
)

const MockLLMModel = "benchmark-mock"

// mockMemoryExtractText is the fixed memory text the mock LLM emits, both in
// the memory-extraction JSON response and as the save_memory tool's text argument.
const mockMemoryExtractText = "User prefers concise answers"

type LLMStats struct {
mu sync.Mutex
TotalInputTokens int
Expand Down Expand Up @@ -64,6 +67,17 @@ func (m *MockLLMClient) Generate(ctx context.Context, request *interfaces.LLMReq
promptTokens, completionTokens := splitMockTokens(m.cfg.MockTokens)
m.stats.add(promptTokens, completionTokens)

if isMemoryExtractRequest(request) {
return &interfaces.LLMResponse{
Content: fmt.Sprintf(`{"memories":[{"text":%q,"kind":"preference"}]}`, mockMemoryExtractText),
Usage: &interfaces.LLMUsage{
PromptTokens: int64(promptTokens),
CompletionTokens: int64(completionTokens),
TotalTokens: int64(promptTokens + completionTokens),
},
}, nil
}

if hasToolResultMessages(request) {
return &interfaces.LLMResponse{
Content: "benchmark complete",
Expand Down Expand Up @@ -149,12 +163,26 @@ func hasToolResultMessages(request *interfaces.LLMRequest) bool {
}

// mockToolArgs returns the canned argument map the mock LLM supplies when it
// calls the tool named toolName:
//   - the save-memory tool gets the fixed memory text with kind "preference";
//   - tools whose name starts with "subagent_" get a fixed sub-task query;
//   - every other tool gets a single "input" argument.
func mockToolArgs(toolName string) map[string]any {
	switch {
	case toolName == types.SaveMemoryToolName:
		return map[string]any{
			types.MemoryToolParamText: mockMemoryExtractText,
			types.MemoryToolParamKind: "preference",
		}
	case strings.HasPrefix(toolName, "subagent_"):
		return map[string]any{runtime.SubAgentToolParamQuery: "benchmark subtask"}
	default:
		return map[string]any{"input": "benchmark"}
	}
}

// isMemoryExtractRequest reports whether request asks for a JSON response
// format named "MemoryExtraction". A nil request or a request without a
// response format yields false.
func isMemoryExtractRequest(request *interfaces.LLMRequest) bool {
	if request == nil {
		return false
	}
	format := request.ResponseFormat
	if format == nil {
		return false
	}
	if format.Type != interfaces.ResponseFormatJSON {
		return false
	}
	return format.Name == "MemoryExtraction"
}

func splitMockTokens(total int) (prompt, completion int) {
if total <= 0 {
return 0, 0
Expand Down
Loading
Loading