IgnitionAI · salim4n · May 29, 2026 · May 29, 2026
diff --git a/packages/cli/src/cli.ts b/packages/cli/src/cli.ts
@@ -20,6 +20,7 @@ import {
   type Target2DDemoOptions,
 } from "@ignitionrl/examples";
 import {
+  createDqnLearner,
   createLinearPolicySearchLearner,
   createTabularQLearner,
 } from "@ignitionrl/learning";
@@ -3460,6 +3461,17 @@ async function runKnownCheckpointInference(
     });
   }
 
+  if (sourceRun.envId === GridWorld.id && sourceRun.algorithm === "dqn") {
+    return runCheckpointPolicyExperiment({
+      ...common,
+      env: GridWorld,
+      learner: createDqnLearner<GridWorldAction>(),
+      policyOptions: {
+        explore: false,
+      },
+    });
+  }
+
   if (sourceRun.envId === Target2D.id && sourceRun.algorithm === "tabular-q-learning") {
     return runCheckpointPolicyExperiment({
       ...common,
@@ -3471,6 +3483,17 @@ async function runKnownCheckpointInference(
     });
   }
 
+  if (sourceRun.envId === Target2D.id && sourceRun.algorithm === "dqn") {
+    return runCheckpointPolicyExperiment({
+      ...common,
+      env: Target2D,
+      learner: createDqnLearner<Target2DAction>(),
+      policyOptions: {
+        explore: false,
+      },
+    });
+  }
+
   if (sourceRun.envId === DroneTarget.id && sourceRun.algorithm === "linear-policy-search") {
     return runCheckpointPolicyExperiment({
       ...common,

diff --git a/packages/learning/README.md b/packages/learning/README.md
@@ -2,9 +2,10 @@
 
 Baseline learning helpers for IgnitionRL.
 
-This package intentionally starts small. It does not implement PPO, DQN, SAC, tensors or native backends. The first learners are deterministic baselines for validating environment loops, transitions, metrics and checkpoint plumbing:
+This package intentionally starts small. It does not implement PPO, SAC, native tensor backends or high-throughput training. The first learners validate environment loops, transitions, metrics and checkpoint plumbing:
 
 - tabular Q-learning for discrete environments;
+- DQN for small, deterministic, discrete debug environments;
 - linear policy search for bounded continuous environments.
 
 ## Tabular Q-Learning
@@ -27,6 +28,32 @@ const action = result.learner.selectAction([0, 0, 3, 3], { explore: false })
 console.log(action)
 ```
 
+## DQN
+
+```ts
+import { trainDqn } from "@ignitionrl/learning"
+import { GridWorld } from "@ignitionrl/examples"
+
+const result = await trainDqn(GridWorld, {
+  episodes: 200,
+  maxSteps: 50,
+  seed: 42,
+  learnerOptions: {
+    hiddenSize: 32,
+    epsilonStart: 0.4,
+    epsilonEnd: 0.05,
+  },
+})
+
+const action = result.learner.selectAction(
+  GridWorld.createRunner({ seed: 42 }).getObservation(),
+  { explore: false }
+)
+console.log(action)
+```
+
+`DqnLearner` is the first neural learner path. It is intentionally compact: a seeded one-hidden-layer MLP with a replay buffer, epsilon decay and a target network, aimed at discrete toy/debug environments. It is not the final high-throughput backend, but it proves that neural learners can train, emit stable metrics and save checkpoints behind the same environment contract.
+
 ## Linear Policy Search
 
 ```ts
@@ -82,6 +109,7 @@ const metrics = learnerMetricSpecsForAlgorithm("tabular-q-learning")
 Default baseline configs are:
 
 - `tabular-q-learning`: `learningRate: 0.2`, `discount: 0.95`, `epsilon: 0.1`, `initialQ: 0`, `observationPrecision: 2`, `seed: 0`;
+- `dqn`: `learningRate: 0.01`, `discount: 0.95`, `epsilonStart: 0.4`, `epsilonEnd: 0.05`, `epsilonDecaySteps: 500`, `batchSize: 16`, `replayCapacity: 5000`, `minReplaySize: 16`, `trainEverySteps: 1`, `targetUpdateInterval: 50`, `hiddenSize: 32`, `gradientClip: 5`, `weightScale: 0.05`, `seed: 0`;
 - `linear-policy-search`: `sigma: 0.2`, `actionNoise: 0.03`, `initialWeightScale: 0.05`, `populationSize: 6`, `eliteCount: 2`, `seed: 0`.
 
 Default neural adapter cadences are:
@@ -112,7 +140,7 @@ Checkpoints are JSON-serializable and include:
 - action shape and bounds, or discrete action values;
 - learner config;
 - metrics;
-- learned Q-table or policy weights.
+- learned Q-table, DQN network weights or policy weights.
 
 `linear-policy-search` keeps v1 checkpoint loading backward-compatible when newer diagnostic metrics are missing. Loading normalizes those fields before inference, so older demo artifacts can still be replayed through the current learner.
 
@@ -146,7 +174,7 @@ Built-in support profiles are intentionally conservative:
 
 Unsupported combinations fail before a run starts with an algorithm-specific error. Custom neural adapters must declare their supported action spaces explicitly.
 
-Current `TabularQLearner` and `LinearPolicySearchLearner` remain direct TypeScript learners. Future DQN/PPO/SAC implementations can sit behind this contract whether the backend is TypeScript, Rust/Burn, Rust/Candle or another native process. Environment authors still only implement `defineEnvironment()`.
+Current `TabularQLearner`, `DqnLearner` and `LinearPolicySearchLearner` remain direct TypeScript learners. Future PPO/SAC implementations, and heavier DQN backends, can sit behind this contract whether the backend is TypeScript, Rust/Burn, Rust/Candle or another native process. Environment authors still only implement `defineEnvironment()`.
 
 ## Scope