diff --git a/packages/learning/README.md b/packages/learning/README.md index f67ade7..a3dbcc4 100644 --- a/packages/learning/README.md +++ b/packages/learning/README.md @@ -61,6 +61,41 @@ console.log(diagnostics.bestPolicy?.weights) `LinearPolicySearchLearner` is a small non-neural continuous-control baseline. It learns a linear observation-to-action policy with seeded candidate perturbations, optional label-based warm starts, elite averaging, bounded actions and JSON checkpoints. It also exposes numeric learner metrics such as candidate index, population rewards, improvement flags and policy weight norms, plus label-aware diagnostics for inspecting the current, mean and best policy matrices. It is meant to prove the continuous learner/checkpoint/debugging path before PPO/SAC or native tensor backends exist. +## Configuration and Metrics + +Learner hyperparameters are normalized through exported config helpers: + +```ts +import { + defineTabularQConfig, + learnerMetricSpecsForAlgorithm, +} from "@ignitionrl/learning" + +const learnerConfig = defineTabularQConfig({ + epsilon: 0.2, + learningRate: 0.3, +}) + +const metrics = learnerMetricSpecsForAlgorithm("tabular-q-learning") +``` + +Default baseline configs are: + +- `tabular-q-learning`: `learningRate: 0.2`, `discount: 0.95`, `epsilon: 0.1`, `initialQ: 0`, `observationPrecision: 2`, `seed: 0`; +- `linear-policy-search`: `sigma: 0.2`, `actionNoise: 0.03`, `initialWeightScale: 0.05`, `populationSize: 6`, `eliteCount: 2`, `seed: 0`. + +Default neural adapter cadences are: + +- `dqn`: step updates every step after `warmupSteps: 1000`, with `batchSize: 64`; +- `ppo`: rollout updates every `2048` steps, with `epochs: 10` and `minibatches: 32`; +- `sac`: step updates every step after `warmupSteps: 1000`, with `batchSize: 64`. + +Network, optimizer and replay-buffer hyperparameters are intentionally left to the future learner implementation while the adapter contract keeps their metrics and checkpoints stable. 
+ +Invalid hyperparameters throw during config normalization or learner initialization, before a Studio/SDK learner run is created. Learners also expose `getConfig()`, so `@ignitionrl/sdk` can persist the effective JSON config under `run.config.learnerConfig` for reproducibility. + +Metric catalogs use the persisted run metric names that Studio panels and CI gates should depend on. Episode metrics are `totalReward`, `length`, `success`, `terminated` and `truncated`. Learner metrics are namespaced as `learner.*`, for example `learner.lastTdError`, `learner.epsilon` or `learner.tdLoss`. + ## Checkpoints ```ts diff --git a/packages/learning/src/index.ts b/packages/learning/src/index.ts index 39a8dcf..8561f0c 100644 --- a/packages/learning/src/index.ts +++ b/packages/learning/src/index.ts @@ -11,10 +11,12 @@ export { export { LINEAR_POLICY_SEARCH_ALGORITHM, LINEAR_POLICY_SEARCH_CHECKPOINT_VERSION, + DEFAULT_LINEAR_POLICY_SEARCH_CONFIG, LinearPolicySearchLearner, assertCheckpointMatchesContinuousSpec, assertLinearPolicySearchCheckpoint, createLinearPolicySearchLearner, + defineLinearPolicySearchConfig, normalizeLinearPolicySearchCheckpoint, type LinearPolicySearchActOptions, type LinearPolicySearchCandidateSummary, @@ -28,12 +30,29 @@ export { type LinearPolicySearchPolicyWeights, } from "./linear-policy-search.js"; export { + DEFAULT_TABULAR_Q_CONFIG, TabularQLearner, createTabularQLearner, + defineTabularQConfig, transitionFromStep, type SelectActionOptions, type TabularQOptions, } from "./tabular-q.js"; +export { + EPISODE_METRIC_SPECS, + LINEAR_POLICY_SEARCH_METRIC_SPECS, + TABULAR_Q_METRIC_SPECS, + assertLearnerMetricSpecs, + learnerMetricSpecsForAlgorithm, + learnerMetricSpecsFromNeuralAdapterContract, + stableMetricNamesForAlgorithm, + stableMetricNamesFromNeuralAdapterContract, + type LearnerMetricCatalogOptions, + type LearnerMetricDirection, + type LearnerMetricReducer, + type LearnerMetricScope, + type LearnerMetricSpec, +} from "./metrics.js"; export { 
NEURAL_ADAPTER_CONTRACT_VERSION, NEURAL_CHECKPOINT_SCHEMA_VERSION, diff --git a/packages/learning/src/linear-policy-search.ts b/packages/learning/src/linear-policy-search.ts index ebc7da3..c11785b 100644 --- a/packages/learning/src/linear-policy-search.ts +++ b/packages/learning/src/linear-policy-search.ts @@ -108,14 +108,14 @@ type LinearPolicySearchMetricDefaults = { readonly bestWeights?: readonly number[]; }; -const DEFAULT_CONFIG: LinearPolicySearchConfig = { +export const DEFAULT_LINEAR_POLICY_SEARCH_CONFIG: LinearPolicySearchConfig = Object.freeze({ seed: 0, sigma: 0.2, actionNoise: 0.03, initialWeightScale: 0.05, populationSize: 6, eliteCount: 2, -}; +}); type Candidate = { readonly weights: readonly number[]; @@ -158,13 +158,13 @@ export class LinearPolicySearchLearner implements Learner { private lastImproved = 0; constructor(options: LinearPolicySearchOptions = {}) { - this.config = normalizeConfig(options); + this.config = defineLinearPolicySearchConfig(options); this.metadata = options.metadata; this.rng = createSeededRng(this.config.seed); } async init(spec: EnvironmentSpec, config: LearnerConfig = {}): Promise { - const nextConfig = normalizeConfig({ + const nextConfig = defineLinearPolicySearchConfig({ ...this.config, ...config, }); @@ -327,6 +327,10 @@ export class LinearPolicySearchLearner implements Learner { }; } + getConfig(): LinearPolicySearchConfig { + return cloneLinearPolicySearchConfig(this.config); + } + getPolicyWeights( policy: LinearPolicySearchPolicyKind = "best", ): LinearPolicySearchPolicyWeights { @@ -481,6 +485,12 @@ export function createLinearPolicySearchLearner( return new LinearPolicySearchLearner(options); } +export function defineLinearPolicySearchConfig( + options: Partial = {}, +): LinearPolicySearchConfig { + return normalizeConfig(options); +} + export function normalizeLinearPolicySearchCheckpoint( value: unknown, ): LinearPolicySearchCheckpoint { @@ -599,8 +609,14 @@ export function 
assertCheckpointMatchesContinuousSpec( function normalizeConfig(value: Partial): LinearPolicySearchConfig { const config = { - ...DEFAULT_CONFIG, - ...value, + seed: value.seed ?? DEFAULT_LINEAR_POLICY_SEARCH_CONFIG.seed, + sigma: value.sigma ?? DEFAULT_LINEAR_POLICY_SEARCH_CONFIG.sigma, + actionNoise: value.actionNoise ?? DEFAULT_LINEAR_POLICY_SEARCH_CONFIG.actionNoise, + initialWeightScale: value.initialWeightScale + ?? DEFAULT_LINEAR_POLICY_SEARCH_CONFIG.initialWeightScale, + populationSize: value.populationSize + ?? DEFAULT_LINEAR_POLICY_SEARCH_CONFIG.populationSize, + eliteCount: value.eliteCount ?? DEFAULT_LINEAR_POLICY_SEARCH_CONFIG.eliteCount, ...(value.initialWeightMap !== undefined ? { initialWeightMap: cloneInitialWeightMap(value.initialWeightMap) } : {}), @@ -611,6 +627,17 @@ function normalizeConfig(value: Partial): LinearPolicy return config; } +function cloneLinearPolicySearchConfig( + config: LinearPolicySearchConfig, +): LinearPolicySearchConfig { + return { + ...config, + ...(config.initialWeightMap !== undefined + ? 
{ initialWeightMap: cloneInitialWeightMap(config.initialWeightMap) } + : {}), + }; +} + function assertLinearPolicySearchConfig( value: unknown, ): asserts value is LinearPolicySearchConfig { diff --git a/packages/learning/src/metrics.ts b/packages/learning/src/metrics.ts new file mode 100644 index 0000000..f6647aa --- /dev/null +++ b/packages/learning/src/metrics.ts @@ -0,0 +1,372 @@ +import { + TABULAR_Q_ALGORITHM, + type TabularQMetrics, +} from "./checkpoint.js"; +import { + LINEAR_POLICY_SEARCH_ALGORITHM, + type LinearPolicySearchMetrics, +} from "./linear-policy-search.js"; +import { + defaultMetricSpecs, + type NeuralLearnerAdapterContract, + type NeuralMetricSpec, +} from "./neural-adapter.js"; + +export type LearnerMetricScope = "episode" | "update" | "run"; +export type LearnerMetricDirection = "maximize" | "minimize" | "none"; +export type LearnerMetricReducer = "last" | "mean" | "min" | "max" | "sum"; + +export type LearnerMetricSpec = { + readonly name: string; + readonly scope: LearnerMetricScope; + readonly direction: LearnerMetricDirection; + readonly reducer: LearnerMetricReducer; + readonly unit?: string; + readonly description?: string; +}; + +export type LearnerMetricCatalogOptions = { + readonly includeEpisodeMetrics?: boolean; +}; + +export const EPISODE_METRIC_SPECS: readonly LearnerMetricSpec[] = Object.freeze([ + { + name: "totalReward", + scope: "episode", + direction: "maximize", + reducer: "mean", + description: "Episode total reward emitted by the environment runner.", + }, + { + name: "length", + scope: "episode", + direction: "minimize", + reducer: "mean", + unit: "steps", + description: "Number of environment steps in the episode.", + }, + { + name: "success", + scope: "episode", + direction: "maximize", + reducer: "mean", + description: "1 when the episode ended successfully, otherwise 0.", + }, + { + name: "terminated", + scope: "episode", + direction: "none", + reducer: "sum", + description: "1 when the environment termination 
condition fired.", + }, + { + name: "truncated", + scope: "episode", + direction: "none", + reducer: "sum", + description: "1 when the episode ended because of a step cap or cancellation.", + }, +]); + +export const TABULAR_Q_METRIC_SPECS: readonly LearnerMetricSpec[] = Object.freeze([ + learnerMetric( + "states", + "run", + "none", + "last", + "Number of discretized observation states discovered in the Q table.", + ), + learnerMetric( + "transitions", + "run", + "none", + "last", + "Number of transitions observed by the learner.", + ), + learnerMetric( + "episodes", + "run", + "none", + "last", + "Number of completed training episodes observed by the learner.", + ), + learnerMetric( + "lastTdError", + "update", + "minimize", + "last", + "Most recent temporal-difference error.", + ), +]); + +export const LINEAR_POLICY_SEARCH_METRIC_SPECS: readonly LearnerMetricSpec[] = Object.freeze([ + learnerMetric( + "transitions", + "run", + "none", + "last", + "Number of transitions observed by the learner.", + ), + learnerMetric( + "episodes", + "run", + "none", + "last", + "Number of completed training episodes observed by the learner.", + ), + learnerMetric( + "iterations", + "update", + "none", + "last", + "Number of finished population updates.", + ), + learnerMetric( + "bestReward", + "run", + "maximize", + "max", + "Best candidate episode reward seen so far.", + ), + learnerMetric( + "lastEpisodeReward", + "episode", + "maximize", + "last", + "Reward from the latest candidate episode.", + ), + learnerMetric( + "lastMeanEliteReward", + "update", + "maximize", + "last", + "Mean reward of the elite candidates from the latest completed population.", + ), + learnerMetric( + "lastPopulationBestReward", + "update", + "maximize", + "last", + "Best reward in the latest completed population.", + ), + learnerMetric( + "lastPopulationWorstReward", + "update", + "maximize", + "last", + "Worst reward in the latest completed population.", + ), + learnerMetric( + 
"lastPopulationMeanReward", + "update", + "maximize", + "last", + "Mean reward in the latest completed population.", + ), + learnerMetric( + "lastPopulationRewardStdDev", + "update", + "none", + "last", + "Reward standard deviation in the latest completed population.", + ), + learnerMetric( + "sigma", + "run", + "none", + "last", + "Current candidate sampling standard deviation.", + ), + learnerMetric( + "improvements", + "run", + "maximize", + "last", + "Number of times a new best policy was found.", + ), + learnerMetric( + "lastImproved", + "episode", + "maximize", + "last", + "1 when the latest candidate improved the best reward.", + ), + learnerMetric( + "candidateIndex", + "episode", + "none", + "last", + "Current candidate index within the active population.", + ), + learnerMetric( + "populationFilled", + "episode", + "none", + "last", + "Number of evaluated candidates in the active population.", + ), + learnerMetric( + "meanWeightNorm", + "update", + "none", + "last", + "L2 norm of the mean policy weights.", + ), + learnerMetric( + "currentWeightNorm", + "episode", + "none", + "last", + "L2 norm of the active candidate weights.", + ), + learnerMetric( + "bestWeightNorm", + "run", + "none", + "last", + "L2 norm of the best policy weights.", + ), +]); + +export function learnerMetricSpecsForAlgorithm( + algorithm: string, + options: LearnerMetricCatalogOptions = {}, +): readonly LearnerMetricSpec[] { + const learnerMetrics = learnerOnlyMetricSpecsForAlgorithm(algorithm); + + return withEpisodeMetrics(learnerMetrics, options); +} + +export function learnerMetricSpecsFromNeuralAdapterContract( + contract: NeuralLearnerAdapterContract, + options: LearnerMetricCatalogOptions = {}, +): readonly LearnerMetricSpec[] { + return withEpisodeMetrics( + contract.metrics.map(metricSpecFromNeuralMetric), + options, + ); +} + +export function stableMetricNamesForAlgorithm( + algorithm: string, + options: LearnerMetricCatalogOptions = {}, +): readonly string[] { + return 
learnerMetricSpecsForAlgorithm(algorithm, options).map((metric) => metric.name); +} + +export function stableMetricNamesFromNeuralAdapterContract( + contract: NeuralLearnerAdapterContract, + options: LearnerMetricCatalogOptions = {}, +): readonly string[] { + return learnerMetricSpecsFromNeuralAdapterContract(contract, options) + .map((metric) => metric.name); +} + +export function assertLearnerMetricSpecs( + metrics: readonly LearnerMetricSpec[], +): void { + if (metrics.length === 0) { + throw new Error("[IgnitionRL] Learner metric specs must include at least one metric."); + } + + const names = new Set(); + + for (const metric of metrics) { + if (metric.name.trim().length === 0) { + throw new Error("[IgnitionRL] Learner metric names must be non-empty."); + } + + if (names.has(metric.name)) { + throw new Error(`[IgnitionRL] Duplicate learner metric name: ${metric.name}.`); + } + + names.add(metric.name); + } +} + +function learnerOnlyMetricSpecsForAlgorithm( + algorithm: string, +): readonly LearnerMetricSpec[] { + if (algorithm === TABULAR_Q_ALGORITHM) { + return TABULAR_Q_METRIC_SPECS; + } + + if (algorithm === LINEAR_POLICY_SEARCH_ALGORITHM) { + return LINEAR_POLICY_SEARCH_METRIC_SPECS; + } + + if (algorithm === "dqn" || algorithm === "ppo" || algorithm === "sac") { + return defaultMetricSpecs(algorithm).map(metricSpecFromNeuralMetric); + } + + return []; +} + +function withEpisodeMetrics( + learnerMetrics: readonly LearnerMetricSpec[], + options: LearnerMetricCatalogOptions, +): readonly LearnerMetricSpec[] { + const metrics = options.includeEpisodeMetrics === false + ? 
[...learnerMetrics] + : [...EPISODE_METRIC_SPECS, ...learnerMetrics]; + const deduped = dedupeMetricSpecs(metrics); + + if (deduped.length > 0) { + assertLearnerMetricSpecs(deduped); + } + + return deduped; +} + +function dedupeMetricSpecs( + metrics: readonly LearnerMetricSpec[], +): readonly LearnerMetricSpec[] { + const names = new Set(); + const deduped: LearnerMetricSpec[] = []; + + for (const metric of metrics) { + if (names.has(metric.name)) { + continue; + } + + names.add(metric.name); + deduped.push(metric); + } + + return deduped; +} + +function learnerMetric( + metric: keyof TMetrics & string, + scope: LearnerMetricScope, + direction: LearnerMetricDirection, + reducer: LearnerMetricReducer, + description: string, +): LearnerMetricSpec { + return { + name: `learner.${metric}`, + scope, + direction, + reducer, + description, + }; +} + +function metricSpecFromNeuralMetric(metric: NeuralMetricSpec): LearnerMetricSpec { + return { + name: persistedNeuralMetricName(metric.name), + scope: metric.scope === "rollout" ? "update" : metric.scope, + direction: metric.direction, + reducer: metric.reducer, + ...(metric.unit !== undefined ? { unit: metric.unit } : {}), + ...(metric.description !== undefined ? 
{ description: metric.description } : {}), + }; +} + +function persistedNeuralMetricName(name: string): string { + if (name === "episodeReward") return "totalReward"; + if (name === "episodeLength") return "length"; + if (name === "successRate") return "success"; + + return `learner.${name}`; +} diff --git a/packages/learning/src/tabular-q.ts b/packages/learning/src/tabular-q.ts index 75d967f..b1865dd 100644 --- a/packages/learning/src/tabular-q.ts +++ b/packages/learning/src/tabular-q.ts @@ -26,14 +26,14 @@ import { type TabularQMetrics, } from "./checkpoint.js"; -const DEFAULT_CONFIG: TabularQConfig = { +export const DEFAULT_TABULAR_Q_CONFIG: TabularQConfig = Object.freeze({ learningRate: 0.2, discount: 0.95, epsilon: 0.1, initialQ: 0, observationPrecision: 2, seed: 0, -}; +}); export type TabularQOptions = Partial & { readonly metadata?: JsonObject; @@ -59,13 +59,13 @@ export class TabularQLearner< private lastTdError = 0; constructor(options: TabularQOptions = {}) { - this.config = normalizeConfig(options); + this.config = defineTabularQConfig(options); this.metadata = options.metadata; this.rng = createSeededRng(this.config.seed); } async init(spec: EnvironmentSpec, config: LearnerConfig = {}): Promise { - const nextConfig = normalizeConfig({ + const nextConfig = defineTabularQConfig({ ...this.config, ...config, }); @@ -173,6 +173,10 @@ export class TabularQLearner< }; } + getConfig(): TabularQConfig { + return { ...this.config }; + } + toCheckpoint(): TabularQCheckpoint { this.assertInitialized(); const spec = this.requireInitializedSpec(); @@ -293,6 +297,12 @@ export function createTabularQLearner< return new TabularQLearner(options); } +export function defineTabularQConfig( + options: Partial = {}, +): TabularQConfig { + return normalizeConfig(options); +} + export function transitionFromStep( step: { readonly observation: readonly number[]; @@ -322,8 +332,13 @@ export function transitionFromStep( function normalizeConfig(options: Partial = {}): 
TabularQConfig { const config = { - ...DEFAULT_CONFIG, - ...options, + learningRate: options.learningRate ?? DEFAULT_TABULAR_Q_CONFIG.learningRate, + discount: options.discount ?? DEFAULT_TABULAR_Q_CONFIG.discount, + epsilon: options.epsilon ?? DEFAULT_TABULAR_Q_CONFIG.epsilon, + initialQ: options.initialQ ?? DEFAULT_TABULAR_Q_CONFIG.initialQ, + observationPrecision: options.observationPrecision + ?? DEFAULT_TABULAR_Q_CONFIG.observationPrecision, + seed: options.seed ?? DEFAULT_TABULAR_Q_CONFIG.seed, }; assertUnitInterval(config.learningRate, "learningRate"); diff --git a/packages/learning/test/config-metrics.test.ts b/packages/learning/test/config-metrics.test.ts new file mode 100644 index 0000000..b4f7560 --- /dev/null +++ b/packages/learning/test/config-metrics.test.ts @@ -0,0 +1,102 @@ +import { describe, expect, test } from "bun:test"; +import { + DEFAULT_LINEAR_POLICY_SEARCH_CONFIG, + DEFAULT_TABULAR_Q_CONFIG, + LINEAR_POLICY_SEARCH_ALGORITHM, + TABULAR_Q_ALGORITHM, + createLinearPolicySearchLearner, + createTabularQLearner, + defineLinearPolicySearchConfig, + defineNeuralLearnerAdapterContract, + defineTabularQConfig, + learnerMetricSpecsForAlgorithm, + learnerMetricSpecsFromNeuralAdapterContract, + stableMetricNamesForAlgorithm, +} from "../src/index.js"; + +describe("learner configuration", () => { + test("normalizes tabular Q defaults and overrides", () => { + expect(defineTabularQConfig()).toEqual(DEFAULT_TABULAR_Q_CONFIG); + expect(defineTabularQConfig({ + epsilon: 0.25, + observationPrecision: 0, + seed: "debug", + })).toEqual({ + learningRate: 0.2, + discount: 0.95, + epsilon: 0.25, + initialQ: 0, + observationPrecision: 0, + seed: "debug", + }); + + expect(() => defineTabularQConfig({ epsilon: 2 })).toThrow("epsilon"); + expect(createTabularQLearner({ + epsilon: 0.3, + metadata: { label: "not part of config" }, + }).getConfig()).toEqual({ + ...DEFAULT_TABULAR_Q_CONFIG, + epsilon: 0.3, + }); + }); + + test("normalizes linear policy search defaults and 
clones nested maps", () => { + const config = defineLinearPolicySearchConfig({ + initialWeightMap: { + thrust: { + position: 0.5, + }, + }, + }); + const learner = createLinearPolicySearchLearner(config); + const learnerConfig = learner.getConfig(); + + expect(defineLinearPolicySearchConfig()).toEqual(DEFAULT_LINEAR_POLICY_SEARCH_CONFIG); + expect(config.initialWeightMap?.thrust?.position).toBe(0.5); + expect(learnerConfig.initialWeightMap).toEqual(config.initialWeightMap); + expect(learnerConfig.initialWeightMap).not.toBe(config.initialWeightMap); + expect(() => defineLinearPolicySearchConfig({ + eliteCount: 4, + populationSize: 2, + })).toThrow("eliteCount"); + }); +}); + +describe("learner metric catalogs", () => { + test("exposes stable persisted metric names for baseline learners", () => { + expect(stableMetricNamesForAlgorithm(TABULAR_Q_ALGORITHM)).toEqual([ + "totalReward", + "length", + "success", + "terminated", + "truncated", + "learner.states", + "learner.transitions", + "learner.episodes", + "learner.lastTdError", + ]); + expect(learnerMetricSpecsForAlgorithm( + LINEAR_POLICY_SEARCH_ALGORITHM, + { includeEpisodeMetrics: false }, + ).map((metric) => metric.name)).toContain("learner.lastPopulationRewardStdDev"); + }); + + test("maps neural adapter metrics onto persisted run metric names", () => { + const contract = defineNeuralLearnerAdapterContract({ + id: "DebugDiscrete-v0", + observation: { type: "vector", shape: [4], dtype: "float32" }, + actions: { type: "discrete", values: ["left", "right"] }, + }, { + algorithm: "dqn", + }); + const metricNames = learnerMetricSpecsFromNeuralAdapterContract(contract) + .map((metric) => metric.name); + + expect(metricNames).toContain("totalReward"); + expect(metricNames).toContain("length"); + expect(metricNames).toContain("success"); + expect(metricNames).toContain("learner.tdLoss"); + expect(metricNames).toContain("learner.epsilon"); + expect(new Set(metricNames).size).toBe(metricNames.length); + }); +}); diff 
--git a/packages/learning/test/type-inference.ts b/packages/learning/test/type-inference.ts index 85b4a77..8c03b74 100644 --- a/packages/learning/test/type-inference.ts +++ b/packages/learning/test/type-inference.ts @@ -3,7 +3,9 @@ import { createLinearPolicySearchLearner, createNeuralCheckpointEnvelope, createTabularQLearner, + defineTabularQConfig, defineNeuralLearnerAdapterContract, + learnerMetricSpecsForAlgorithm, trainLinearPolicySearch, trainTabularQ, type LinearPolicySearchCheckpoint, @@ -54,6 +56,12 @@ const neuralContract: NeuralLearnerAdapterContract = defineNeuralLearnerAdapterC const neuralCheckpoint = createNeuralCheckpointEnvelope(neuralContract, { qNetwork: "runs/tiny/checkpoints/q-network.json", }); +const tabularConfig = defineTabularQConfig({ + epsilon: 0.2, + observationPrecision: 1, +}); +const tabularMetricNames = learnerMetricSpecsForAlgorithm("tabular-q-learning") + .map((metric) => metric.name); const checkpoint: TabularQCheckpoint = { version: 1, algorithm: "tabular-q-learning", @@ -130,8 +138,11 @@ async function smoke(): Promise { acceptsTinyAction(action); const acceptsNeuralCheckpointEnv = (_envId: "Tiny-v0") => undefined; + const acceptsTabularConfigSeed = (_seed: number | string) => undefined; acceptsNeuralCheckpointEnv(neuralCheckpoint.envId as "Tiny-v0"); + acceptsTabularConfigSeed(tabularConfig.seed); + tabularMetricNames.includes("learner.transitions"); learner.loadCheckpoint(checkpoint, TinyEnv.getSpec()); await continuousLearner.init(TinyContinuousEnv.getSpec(), {}); diff --git a/packages/sdk/src/experiment.ts b/packages/sdk/src/experiment.ts index 7d43c44..d9bb059 100644 --- a/packages/sdk/src/experiment.ts +++ b/packages/sdk/src/experiment.ts @@ -6,6 +6,7 @@ import type { EnvironmentSpec, EpisodeTrace, JsonObject, + JsonValue, Learner, LearnerConfig, Metrics, @@ -235,16 +236,20 @@ export async function runLearnerExperiment< } const algorithm = options.algorithm ?? 
options.learner.name; - const run = await options.store.createRun(createRunOptions(options, algorithm, episodes)); + const spec = options.env.getSpec(options.seed === undefined ? {} : { seed: options.seed }); + const initConfig = cloneLearnerConfig(options.learnerConfig ?? {}, "learnerConfig"); + + await options.learner.init(spec, initConfig); + + const learnerConfig = resolveLearnerConfig(options.learner, options.learnerConfig); + const run = await options.store.createRun(createRunOptions({ + ...options, + ...(learnerConfig !== undefined ? { learnerConfig } : {}), + }, algorithm, episodes)); await options.onRun?.(await options.store.updateRun(run.id, { status: "running" })); try { - await options.learner.init( - options.env.getSpec(options.seed === undefined ? {} : { seed: options.seed }), - options.learnerConfig ?? {}, - ); - const runner = options.env.createRunner({ runId: run.id, collectTrace: true, @@ -504,6 +509,7 @@ function createRunOptions( readonly seed?: Seed; readonly maxSteps?: number; readonly config?: JsonObject; + readonly learnerConfig?: LearnerConfig; readonly metadata?: JsonObject; }, algorithm: string, @@ -515,9 +521,12 @@ function createRunOptions( algorithm, ...(options.seed !== undefined ? { seed: options.seed } : {}), config: { + ...(options.config ?? {}), episodes, ...(options.maxSteps !== undefined ? { maxSteps: options.maxSteps } : {}), - ...(options.config ?? {}), + ...(options.learnerConfig !== undefined + ? { learnerConfig: options.learnerConfig } + : {}), }, ...(options.metadata !== undefined ? 
{ metadata: options.metadata } : {}), }; @@ -596,6 +605,91 @@ function metricValuesForEpisode(episode: LearnerExperimentEpisode): Record( + learner: Learner, + explicitConfig: LearnerConfig | undefined, +): LearnerConfig | undefined { + const maybeConfigurable = learner as Learner & LearnerWithConfig; + + if (typeof maybeConfigurable.getConfig === "function") { + return cloneLearnerConfig(maybeConfigurable.getConfig(), "learner.getConfig()"); + } + + if (explicitConfig !== undefined) { + return cloneLearnerConfig(explicitConfig, "learnerConfig"); + } + + return undefined; +} + +function cloneLearnerConfig(value: LearnerConfig, label: string): LearnerConfig { + assertSerializableJsonObject(value, label); + + return JSON.parse(JSON.stringify(value)) as LearnerConfig; +} + +function assertSerializableJsonObject( + value: unknown, + label: string, +): asserts value is JsonObject { + if (!isPlainObject(value)) { + throw new Error(`[IgnitionRL] ${label} must be a JSON object.`); + } + + for (const [key, nested] of Object.entries(value)) { + assertSerializableJsonValue(nested, `${label}.${key}`); + } +} + +function assertSerializableJsonValue( + value: unknown, + label: string, +): asserts value is JsonValue { + if ( + value === null + || typeof value === "string" + || typeof value === "boolean" + ) { + return; + } + + if (typeof value === "number") { + if (!Number.isFinite(value)) { + throw new Error(`[IgnitionRL] ${label} must be a finite JSON number.`); + } + return; + } + + if (Array.isArray(value)) { + value.forEach((entry, index) => + assertSerializableJsonValue(entry, `${label}[${index}]`)); + return; + } + + if (isPlainObject(value)) { + for (const [key, nested] of Object.entries(value)) { + assertSerializableJsonValue(nested, `${label}.${key}`); + } + return; + } + + throw new Error(`[IgnitionRL] ${label} must be JSON serializable.`); +} + +function isPlainObject(value: unknown): value is Record { + if (typeof value !== "object" || value === null || 
Array.isArray(value)) { + return false; + } + + const prototype = Object.getPrototypeOf(value); + + return prototype === Object.prototype || prototype === null; +} + async function resolveCheckpoint( store: LocalProjectStore, runId: string, diff --git a/packages/sdk/test/experiment.test.ts b/packages/sdk/test/experiment.test.ts index bb49a03..24a7fbf 100644 --- a/packages/sdk/test/experiment.test.ts +++ b/packages/sdk/test/experiment.test.ts @@ -269,11 +269,24 @@ describe("runLearnerExperiment", () => { }, }); const metrics = await store.readMetrics("learner-run"); + const run = await store.readRun("learner-run"); const traces = await store.listTraces("learner-run"); const checkpoints = await store.listCheckpoints("learner-run"); const checkpoint = await store.readCheckpoint<{ algorithm: string }>("learner-run", "final"); expect(result.run.status).toBe("completed"); + expect(run.config).toMatchObject({ + episodes: 80, + maxSteps: 5, + learnerConfig: { + learningRate: 0.5, + discount: 0.95, + epsilon: 0.25, + initialQ: 0, + observationPrecision: 0, + seed: 13, + }, + }); expect(result.summary.episodes).toBe(80); expect(result.summary.successRate).toBeGreaterThan(0); expect(result.episodes).toHaveLength(80); @@ -287,6 +300,29 @@ describe("runLearnerExperiment", () => { }); }); + test("rejects invalid learner configs before creating a run", async () => { + await withProjectDir(async (dir) => { + const store = await createIgnitionProject(dir, { + id: "demo", + name: "Demo Project", + }); + + await expect( + runLearnerExperiment({ + store, + env: LineWorld, + learner: createTabularQLearner<"left" | "right">(), + learnerConfig: { + epsilon: 2, + }, + runId: "invalid-config-run", + }), + ).rejects.toThrow("epsilon"); + + expect(await store.listRuns()).toEqual([]); + }); + }); + test("marks learner runs as failed when learner execution throws", async () => { await withProjectDir(async (dir) => { const store = await createIgnitionProject(dir, {