From 3b4ac1e793a8f1c817d1e79d1c132dbc4961c38c Mon Sep 17 00:00:00 2001 From: Tyler Reckart Date: Mon, 22 Jun 2026 08:36:02 -0400 Subject: [PATCH 1/4] hardware design --- examples/3bo/BOM.md | 297 +++++++ examples/3bo/CIRCUIT.md | 388 +++++++++ examples/3bo/FIRMWARE.md | 218 +++++ examples/3bo/JETSON.md | 720 +++++++++++++++++ examples/3bo/README.md | 177 +++++ examples/3bo/VISION.md | 488 ++++++++++++ examples/3bo/bridge/README.md | 64 ++ examples/3bo/bridge/bridge.py | 61 ++ examples/3bo/bridge/bridge_stub.py | 146 ++++ examples/3bo/firmware/README.md | 65 ++ .../threebo_config.example.h | 28 + .../threebo_nano_esp32/threebo_nano_esp32.ino | 551 +++++++++++++ examples/voice-bridge/README.md | 130 +++ examples/voice-bridge/bridge.py | 743 ++++++++++++++++++ include/advisor.h | 63 ++ src/advisor.cpp | 100 +++ src/orchestrator.cpp | 83 +- 17 files changed, 4251 insertions(+), 71 deletions(-) create mode 100644 examples/3bo/BOM.md create mode 100644 examples/3bo/CIRCUIT.md create mode 100644 examples/3bo/FIRMWARE.md create mode 100644 examples/3bo/JETSON.md create mode 100644 examples/3bo/README.md create mode 100644 examples/3bo/VISION.md create mode 100644 examples/3bo/bridge/README.md create mode 100644 examples/3bo/bridge/bridge.py create mode 100644 examples/3bo/bridge/bridge_stub.py create mode 100644 examples/3bo/firmware/README.md create mode 100644 examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_config.example.h create mode 100644 examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_nano_esp32.ino create mode 100644 examples/voice-bridge/README.md create mode 100644 examples/voice-bridge/bridge.py create mode 100644 include/advisor.h create mode 100644 src/advisor.cpp diff --git a/examples/3bo/BOM.md b/examples/3bo/BOM.md new file mode 100644 index 0000000..6885657 --- /dev/null +++ b/examples/3bo/BOM.md @@ -0,0 +1,297 @@ +# 3bo bill of materials + +This BOM targets the first 3bo prototype: Jetson Orin local brain, Arduino Nano +ESP32 body 
controller, wake-word listening, I2S microphone input, I2S speaker +output, status LEDs, physical mute, and a consistent bench power setup. + +Prices vary by vendor and region, so the cost column is a planning estimate, +not a purchasing quote. + +Power note: the recommended mobile prototype uses one 4S battery pack feeding +the Jetson's 19 V rail. The Nano ESP32 is powered by, and communicates over, +USB-C from a Jetson USB host port. Keep the LED/audio loads small enough for +the verified USB/body 5 V budget, or add a Jetson-powered USB hub/accessory +5 V rail after measuring current. + +## Core electronics + +| Qty | Item | Suggested spec/example | Est. USD | Notes | +| --- | ---------------------------------------- | --------------------------------------------------------------------------------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| 1 | Local Linux/AI brain | NVIDIA Jetson Orin Nano Super Developer Kit | 249+ | Runs 3bo bridge, Arbiter daemon, local STT/TTS, logs, and model storage. | +| 1 | Jetson storage | microSD card or NVMe SSD supported by the carrier | 15-80 | Prefer NVMe if the carrier/setup supports it; local STT models and logs benefit from fast storage. | +| 1 | Jetson bench power supply | Vendor-recommended supply for the exact Jetson carrier board | varies | Use this for bring-up before moving to battery power. | +| 1 | Jetson cooling — standard | Vendor-supplied active cooler (ships with dev kit) | included | Required for bench bring-up. Tallest option (\~35 mm above module). | +| 1 | Jetson cooling — low-profile | Thin copper/aluminium heatsink plate (≤8 mm) + Noctua A4x10 FLX fan (40×40×10 mm) | 20–35 | Saves \~18–20 mm of body height vs the stock cooler. 
Requires ventilation slots in the body enclosure (one inlet, one outlet). Test sustained STT+TTS+vision load with `tegrastats` — throttling above 70 °C indicates a need for a thicker heatsink or better airflow. | +| 1 | Microcontroller | Arduino Nano ESP32 | 25-35 | Main controller. ESP32-S3, 3.3 V I/O, USB-C. Powered by the Jetson USB host link in the wired build. | +| 1 | I2S MEMS microphone | Adafruit ICS-43434 I2S MEMS microphone breakout, product 6049 | 5 | Digital mono microphone. Power from 3.3 V; not for 5 V logic. | +| 1 | I2S audio amplifier | Adafruit MAX98357A I2S 3 W class-D amplifier breakout, product 3006 | 6 | Drives the speaker directly from I2S audio. Runs from 2.7-5.5 V and accepts 3.3 V logic. | +| 1 | Breadboard speaker | Adafruit breadboard-friendly PCB mount mini speaker, 8 ohm 0.2 W, product 1898 | 2 | Quiet first-test speaker. Do not overdrive it with the MAX98357A. | +| 1 | Final enclosure speaker | 8 ohm 1-3 W small speaker | 3-8 | Optional upgrade once the audio path works. Better suited to spoken responses than the 0.2 W breadboard speaker. | +| 1 | Addressable LED indicator | Adafruit NeoPixel Stick, 8 x 5050 RGBW cool white, product 2869 | 8 | Main 3bo status indicator. 5 V power, one data pin, RGBW library required. | +| 1 | USB-C data cable | Short, data-capable USB-C cable from Jetson USB host to Nano ESP32 | 5-15 | Carries power and serial data between Jetson and Nano. Avoid charge-only cables. | +| 1 | USB 5 V breakout or measured VBUS access | USB-C breakout, powered USB hub, or carrier-approved 5 V accessory output | varies | Optional only if the NeoPixel/amp need more 5 V current than the Nano exposes safely. Verify current limits before use. | +| 1 | 5 V body rail, optional | Jetson-powered USB hub/accessory 5 V rail, current-limited/fused | 15-40 | Use only if speaker/LED tests exceed the safe USB/Nano 5 V budget. Not battery-fed separately.
| +| 1 | Breadboard power supply kit | Adafruit adjustable breadboard power supply kit, product 184 | 15 | Bench/test supply or low-current 3.3 V peripheral rail. Do not use as the main 5 V LED/audio rail from 12 V. | +| 1 | Optional 3.3 V regulator | 5 V to 3.3 V regulator module | 2-6 | Only needed if you want a separate 3.3 V rail instead of using the Nano's low-current `3V3` output for the mic. | +| 1 | Physical mute switch | DPDT toggle/slide switch, or SPST switch plus load-switch control | 2-8 | Must remove microphone power, not just signal firmware. Use the second pole or a load-switch enable signal for a hard privacy mute. | +| 1 | Microphone power switch | P-channel MOSFET high-side switch or dedicated 3.3 V load switch module | 2-8 | Disconnects the ICS-43434 3.3 V rail when muted. Firmware also reads mute state, but privacy does not depend on firmware. | +| 1 | Power switch | SPST toggle or slide switch rated for expected input current | 1-4 | Main Jetson/battery input disconnect for the 19 V supply path. | + +## Portable power subsystem + +For the first battery-powered prototype, use an external balance charger and +remove the pack from 3bo for charging. That is simpler and safer than adding +onboard charging before the load profile is measured. + +| Qty | Item | Suggested spec/example | Est. USD | Notes | +| --- | ---------------------------- | ---------------------------------------------------------------------------------------------------------------- | -------- | ------------------------------------------------------------------------------------------------------------- | +| 1 | Prototype battery pack | 4S LiPo, 14.8 V nominal / 16.8 V full, 5000-6000 mAh, XT60 connector | 40-80 | Good first capacity target. Use a hardcase RC pack if the enclosure can fit it. | +| 1 | Balance charger | SkyRC IMAX B6AC V2, HOTA D6 Pro, or equivalent charger that supports 4S LiPo/Li-ion balance charging | 45-130 | Charge the pack outside the robot for v1. 
Use the LiPo/Li-ion program that matches the pack chemistry. | +| 1 | Main fuse holder | Inline ATO/ATC blade fuse holder, 16 AWG or heavier | 5-10 | Install as close to the battery positive lead as practical. | +| 2-3 | Blade fuses | 5 A and 10 A ATO/ATC fuses | 3-8 | Start with 5 A for bench tests; size up only after measuring normal current and startup surge. | +| 1 | Main disconnect switch | DC-rated rocker/toggle switch, at least 10 A at 24 V DC | 5-15 | Put after the fuse. Avoid tiny AC-only panel switches for the battery main. | +| 1 | Jetson 19 V regulator | 4S-compatible buck-boost or boost regulator, 19 V output, at least 3 A continuous, preferably 4-5 A with cooling | 20-60 | Feeds the Jetson DC input. Must pass the regulator acceptance tests below before connecting the Jetson. | +| 1 | Jetson barrel lead | 5.5 mm x 2.5 mm center-positive DC plug pigtail | 3-8 | Match the exact Jetson carrier board connector. Keep polarity labeled. | +| 1 | Low-voltage cutoff | 4S LiPo/Li-ion low-voltage disconnect or protected 4S pack/BMS with load cutoff | 15-60 | Required for unattended or enclosed battery use. A buzzer alone is not protection. | +| 1 | Low-voltage monitor | 1S-8S LiPo cell checker/alarm with balance-plug input | 5-12 | Bench diagnostic only. Set a conservative alarm threshold, but do not rely on it as the cutoff. | +| 1 | XT60 harness kit | XT60 male/female pigtails, heat-shrink, 16-18 AWG silicone wire | 8-20 | Keeps battery wiring serviceable. Use strain relief. | +| 1 | Final-product battery option | 4S Li-ion pack with integrated 4S BMS, 5000-7000 mAh | 60-140 | Better fit for an enclosed stationary robot than a hobby LiPo. Still needs a charger matched to the pack/BMS. | + +## Passives and wiring + +| Qty | Item | Suggested spec/example | Est. 
USD | Notes | +| --- | ------------------------------- | ------------------------------------------------------------------ | -------- | -------------------------------------------------------------------------------------------------------------- | +| 1 | Bulk capacitor | 470-1000 uF electrolytic, 10 V or higher | 1-3 | Place across 5 V and GND near the NeoPixel stick/audio amp. | +| 1 | LED data resistor | 330-470 ohm resistor | \<1 | Place in series with the WS2812/SK6812 data line. | +| 1 | Pull resistor kit | 10 kOhm resistors | \<1 | Useful for mute switch input if not using internal pullups. | +| 1 | Breadboard or perfboard | Solderless breadboard for first test; perfboard for portable build | 5-12 | Move to perfboard once pinout and USB-powered load current are stable. | +| 1 | Jumper wire kit | Male/male and male/female Dupont wires | 5-10 | Keep I2S wires short during testing. | +| 1 | Header pins / screw terminals | 0.1 inch headers, optional terminal blocks | 2-6 | Makes power, speaker, and LED wiring less fragile. | +| 1 | Spare USB-C cable | Data-capable | 3-10 | Programming and serial diagnostics. Charge-only cables will not work for the Jetson serial link. | +| 1 | Ethernet cable or Wi-Fi adapter | Ethernet for bring-up; supported Wi-Fi for untethered use | varies | Jetson Orin Nano dev kits are easiest to bring up over Ethernet. Add wireless only after the base stack works. | + +## Enclosure and mechanical + +| Qty | Item | Suggested spec/example | Est. USD | Notes | +| --- | ----------------------- | -------------------------------------------------------- | -------- | ------------------------------------------------------------------------- | +| 1 | Prototype enclosure | Small plastic project box, 3D print, or laser-cut shell | 5-20 | Leave openings for mic, speaker, LEDs, USB-C, power, and mute. | +| 1 | Speaker grille/material | Printed grille, perforated panel, or fabric | 1-5 | Avoid sealing the speaker behind solid plastic. 
| +| 1 | Diffuser | Frosted acrylic, translucent print, or silicone diffuser | 2-10 | Makes the LED states feel much more polished. | +| 1 | Mounting hardware | M2/M3 screws, standoffs, adhesive pads | 3-10 | Keep the mic mechanically isolated from speaker vibration where possible. | + +## Vision and motion subsystem + +These parts are planned for **Milestone 5** and are not needed for the initial voice prototype. Design the mechanical neck with servo pockets in v1 so it can be upgraded without a rebuild. + +| Qty | Item | Suggested spec/example | Est. USD | Notes | +| --- | -------------------------- | -------------------------------------------------------------------------------------------------------- | ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| 1 | Head camera | Adafruit OV5640 Camera Breakout — 72° Lens with Autofocus, product 5945 | 15 | Mounted in robot head. 8-bit parallel DVP to ESP32-S3 camera peripheral via jumper wires on prototype; custom carrier board for final build. 72° non-distorting lens. ESP32-S3 JPEG-compresses frames and forwards to Jetson over USB serial. | +| 1 | Servo driver board | Adafruit PCA9685 16-channel 12-bit PWM servo driver, product 815 | 15 | I2C address 0x40. Ch 0 = Servo L, ch 1 = Servo R, ch 2–3 = base motor DRV8833 IN1/IN2. | +| 2 | Neck servos | MG90S metal-gear micro servo, or equivalent 9 g metal-gear servo | 4–8 each | Differential push/pull rod mechanism. Servo L + Servo R together = pitch (±30°). Servo L − Servo R = roll (±15°). 50 Hz, 500–2400 µs. Metal gears for longevity. 
| +| 1 | Neck mechanism bracket | Custom 3D-printed neck with rear ball-joint pivot, front rod attachment points, and servo mounts at base | 5–15 (print) | Three-arm differential design: one passive rear pivot (ball joint), two push/pull rods at front (35 mm apart). Servos mount at neck base. Include rod channel and cable bore through neck centre. | +| 1 | N20 gearmotor with encoder | N20 6 V, 100–200 RPM output, with quadrature magnetic encoder (e.g. Pololu #4825 or equivalent) | 10–18 | Drives base rotation. Encoder provides position feedback for closed-loop yaw control. Choose gear ratio so base turns at \~20–60°/s under load. | +| 1 | Motor driver breakout | Adafruit DRV8833 dual H-bridge motor driver breakout, product 3297 | 5 | Controls N20 direction and speed via two PWM signals from PCA9685 channels 2 and 3. Rated 1.5 A per channel — the N20 stall current is well within this rating. | +| 1 | Base bearing | Lazy Susan bearing, 100–150 mm diameter | 5–12 | Supports body weight through full 360° rotation. Choose a ball-bearing type rated for the estimated body load (≥ 2 kg). | +| 1 | Base motor mount | Custom 3D-printed gear/pinion drive bracket | 3–8 (print) | Positions the N20 against the bearing rim or inner race. Design drive ratio for target rotation speed. A small pinion on the N20 output shaft driving a printed ring gear on the base is the cleanest approach. | +| 1 | Slip ring | Capsule slip ring, 12-wire, 2 A per circuit (e.g. Adafruit product 736 or similar) | 15–25 | Passes power and signals through the rotating base joint. Routes 19 V Jetson supply, 5 V body rail, GND, motor IN1/IN2, and encoder A/B from the stationary base to the rotating body. | +| 1 | I2C cable / Dupont wires | 4-wire female–female Dupont cable, 150–200 mm | 2–5 | Connects Jetson 40-pin header (I2C pins 3, 5) + 3.3 V (pin 1) + GND to PCA9685 in head tier. | +| 1 | Servo and motor power | Wired from regulated 5 V body rail to PCA9685 V+ and DRV8833 VM | 0–3 | PCA9685 V+ + DRV8833 VM share the 5 V rail.
Total peak draw (2 servos + N20): \~700 mA. Verify Jetson header or body regulator can supply it. | + +## Optional but useful + +| Qty | Item | Suggested spec/example | Est. USD | Notes | +| --- | --------------------------- | --------------------------------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------- | +| 1 | Logic level shifter | 74AHCT/74HCT 3.3 V to 5 V data level shifter | 2-6 | Recommended for reliable 5 V NeoPixel data from the Nano ESP32's 3.3 V GPIO. | +| 1 | USB power meter | Inline USB-C meter | 8-20 | Useful while estimating Nano ESP32 current during firmware development. | +| 1 | Small development speaker | Extra 8 ohm speaker | 2-5 | Handy for testing audio without mounting the final speaker. | +| 1 | Analog microphone module | Adafruit MAX9814 electret microphone amplifier with AGC, product 1713 | 8 | Preferred optional ADC-based mic for sound-level experiments when volume varies. Not the wake-word/speech mic. | +| 1 | Analog microphone module | Adafruit MAX4466 electret microphone amplifier, product 1063 | 7 | Alternate optional ADC-based mic with adjustable gain. Simpler, but no automatic gain control. | +| 1 | Rechargeable battery option | 4S LiPo prototype pack or 4S Li-ion pack with BMS | varies | Better runtime than PP3 9 V, but more safety and charging complexity. See the portable power subsystem above. 
| + +## Recommended first purchase + +For the smallest useful prototype order: + +- NVIDIA Jetson Orin Nano Super Developer Kit +- Jetson storage and power supply matched to the exact carrier board +- Jetson active cooling +- Ethernet cable or supported Wi-Fi adapter +- Arduino Nano ESP32 +- Short data-capable USB-C cable from Jetson to Nano +- Adafruit ICS-43434 I2S microphone breakout, product 6049 +- Adafruit MAX98357A I2S amplifier breakout, product 3006 +- Adafruit breadboard-friendly 8 ohm 0.2 W mini speaker, product 1898 +- Adafruit NeoPixel Stick 8 x RGBW cool white, product 2869 +- Adafruit adjustable breadboard power supply kit, product 184 +- DPDT hard-mute switch, or SPST switch plus microphone load-switch circuit +- SPST power switch +- Microphone power-switch/load-switch parts for hard mute +- 74AHCT/74HCT data level shifter for the NeoPixel stick +- 470-1000 uF capacitor +- 330-470 ohm resistor +- Jumper wires and breadboard/perfboard + +For **Milestone 5** (vision and tracking), add: Adafruit OV5640 Camera Breakout 72° with Autofocus (product 5945), Adafruit PCA9685 servo driver (product 815), 2× MG90S metal-gear micro servos, 3D-printed three-arm differential neck bracket, M3 ball-link rod ends (×6), M3 threaded rod (\~60 mm per rod), I2C wiring, N20 gearmotor with encoder, DRV8833 breakout, lazy Susan bearing, capsule slip ring. 
+ +For the first portable-power add-on: + +- 4S 5000-6000 mAh LiPo pack with XT60 connector +- 4S-capable balance charger +- Inline blade fuse holder plus 5 A and 10 A fuses +- DC-rated main power switch +- 19 V buck-boost/boost regulator for the Jetson, at least 3 A continuous +- 4S low-voltage cutoff or protected 4S pack/BMS +- 1S-8S LiPo cell checker/alarm for bench diagnostics +- XT60 pigtails, 5.5 mm x 2.5 mm Jetson barrel pigtail, heat-shrink, and strain relief +- Optional Jetson-powered USB hub or current-limited 5 V accessory rail if LED/audio current exceeds the verified USB budget + +## Power budget notes + +Treat the Jetson as a separate high-current compute load. Use the +vendor-recommended Jetson supply during bench bring-up. For an enclosed +single-input robot, add a dedicated regulator sized for the exact Jetson carrier +input and power budget. + +The Nano ESP32 should be powered from the Jetson over the same USB-C cable used +for the serial link. Do not connect the battery pack or a battery-derived buck +regulator to Nano `VIN` in the wired Jetson build. This keeps the battery +system concerned only with the Jetson input rail and avoids USB/VIN backfeed +questions. + +The 5 V LED/audio rail must be treated as a measured USB-powered load. For the +quiet first prototype, keep NeoPixel brightness and speaker volume low and +verify the Jetson USB port, Nano board, and wiring stay within their safe +current limits. If the MAX98357A and NeoPixel stick need more current than the +USB/Nano path can provide, add a Jetson-powered USB hub or current-limited 5 V +accessory rail; do not add a separate battery branch just for the Nano body. + +A 4S 5000 mAh pack is about 74 Wh before conversion losses and reserve: + +```text +14.8 V nominal * 5 Ah = 74 Wh +``` + +Budget a real-world usable fraction rather than draining the pack flat. 
A +Jetson-heavy STT/TTS workload can pull the robot into the 25-35 W range, so a +5000 mAh 4S pack is a roughly 1.5-2.5 hour prototype battery, not an all-day +power source. Portable battery mode requires a hard low-voltage cutoff or a +protected 4S pack/BMS. A balance-plug buzzer is useful while you are nearby on +the bench, but it is not a product safety mechanism. + +Before connecting the Jetson to a battery regulator, run these acceptance tests: + +| Test | Pass condition | +| ----------------- | ----------------------------------------------------------------------------- | +| No-load voltage | 19.0 V nominal, no startup overshoot above the carrier's allowed input range. | +| Dummy load | Holds 19 V for at least 30 minutes at 3 A without thermal shutdown. | +| Load step | Recovers cleanly when switching between light load and 3 A load. | +| Low-pack test | Still regulates when the 4S pack is near cutoff voltage. | +| Polarity check | Center-positive barrel wiring confirmed with a meter at the plug. | +| Branch protection | Jetson branch has its own fuse or protected distribution path. | + +The Adafruit product 184 kit is convenient and breadboard-friendly, but it is +based on a low-dropout linear regulator rather than a switching buck converter. +It is not part of the preferred runtime power path. Use it for bench testing or +a low-current isolated peripheral experiment, not in parallel with the Jetson +USB-powered Nano/body rail. 
+ +A conservative first power target: + +| Load | Planning current / power | +| --------------------------- | --------------------------------------------------------------------- | +| Jetson Orin Nano Super | 7-25 W depending on power mode and workload | +| Nano ESP32 over USB serial | 150-300 mA bursts | +| MAX98357A speaker path | 50-600 mA depending on volume and speaker; keep low on USB power | +| NeoPixel Stick, 8 RGBW LEDs | 100-500 mA depending on brightness/color; cap brightness aggressively | +| ICS-43434 I2S microphone | under 1 mA | + +Cap LED brightness and speaker volume in firmware. A rectangular PP3 9 V +battery is no longer part of the default plan. + +The ICS-43434 microphone needs 3.3 V. The simplest first build can use the Nano +ESP32 `3V3` pin for that low-current microphone rail after the Nano is powered +from Jetson USB. If you use product 184 or another regulator as a separate +3.3 V rail, use it for peripherals only; do not backfeed the Nano `3V3` pin +unless the board documentation explicitly allows it. Route the microphone's +3.3 V through the hard-mute switch or load switch so the mic is physically +unpowered while muted. + +## LED indicator choice + +The selected 3bo LED indicator is the Adafruit NeoPixel Stick with 8 x 5050 +RGBW LEDs in cool white, product 2869. It provides enough pixels for simple +states like idle breathing, wake flash, listening pulse, thinking sweep, and +speaking meter without the current draw of a larger ring or matrix. + +Power the stick from the regulated 5 V rail, tie grounds to the Nano ESP32, and +use a small series resistor on the data line. Product 2869 is RGBW, so firmware +must use a NeoPixel library/configuration that understands four channels per +pixel. + +## Microphone choice + +The selected 3bo speech microphone is the Adafruit ICS-43434 I2S breakout +because it sends digital audio directly over I2S and fits the ESP32-S3 audio +path well.
+ +The Adafruit MAX9814 electret microphone amplifier, product 1713, is the better +optional analog mic if you want an ADC signal for quick sound-level tests or +sound-reactive LED experiments, because its automatic gain control handles +changing volume better. + +The Adafruit MAX4466 electret microphone amplifier, product 1063, is another +optional analog mic with manually adjustable gain. Either analog module can help +compare analog and digital microphone behavior, but neither should replace the +I2S microphone for the main wake-word pipeline unless the firmware is redesigned +around analog sampling. + +## Speaker amplifier choice + +The selected 3bo speaker amplifier is the Adafruit MAX98357A I2S 3 W class-D +breakout, product 3006. It combines the I2S DAC and mono amplifier stage, so the +Nano ESP32 can stream digital audio to it directly. With a 5 V supply, the +breakout is rated up to 3.2 W into 4 ohm or 1.8 W into 8 ohm at 10% THD. + +For the first build, an 8 ohm speaker is the safer default because it draws less +current and gives the USB-powered 5 V rail more margin. The Adafruit +breadboard-friendly 8 ohm 0.2 W mini speaker, product 1898, is useful for quiet +bring-up because it plugs into a breadboard or perfboard, but it must be kept at +low volume. Consider a temporary 47-100 ohm series resistor for initial tone +tests. A later 8 ohm 1-3 W speaker will sound better for actual spoken +responses. A 4 ohm speaker can be louder, but it makes the 5 V rail current +budget more important. + +## Jetson brain choice + +The selected local brain is the NVIDIA Jetson Orin Nano Super Developer Kit. +It is overkill for the first LED-and-speaker prototype, in a useful way: it can +run Arbiter locally while also hosting STT/TTS and later vision or richer robot +behaviors. + +Recommended first Jetson stack: + +- Ubuntu/JetPack on the Jetson. +- `arbiter --api` bound to `127.0.0.1:8080`.
+- A 3bo bridge service bound to the LAN, for example `0.0.0.0:8081`, with + per-device shared-secret authentication and request rate limits. +- `whisper.cpp` or another local STT runtime using small models first. +- Local TTS such as Piper first, cloud TTS as an optional quality upgrade. + +Keep model files, provider keys, Arbiter tenant tokens, and conversation logs +on the Jetson, not on the Nano ESP32. + +## Reference links + +- Arduino Nano ESP32 product/spec page: https://store.arduino.cc/products/nano-esp32 +- Arduino Nano ESP32 docs: https://docs.arduino.cc/hardware/nano-esp32/ +- Adafruit MAX98357A I2S amplifier: https://www.adafruit.com/product/3006 +- Adafruit breadboard-friendly 8 ohm 0.2 W mini speaker: https://www.adafruit.com/product/1898 +- Adafruit ICS-43434 I2S microphone breakout: https://www.adafruit.com/product/6049 +- Adafruit MAX9814 electret microphone amplifier with AGC: https://www.adafruit.com/product/1713 +- Adafruit MAX4466 electret microphone amplifier: https://www.adafruit.com/product/1063 +- Adafruit NeoPixel Stick 8 x RGBW cool white: https://www.adafruit.com/product/2869 +- Adafruit adjustable breadboard power supply kit: https://www.adafruit.com/product/184 +- Pololu step-down voltage regulators: https://www.pololu.com/category/131/step-down-buck-voltage-regulators +- NVIDIA Jetson Orin Nano Super Developer Kit: https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/jetson-orin/nano-super-developer-kit/ +- NVIDIA Jetson Orin Nano Developer Kit user guide: https://docs.nvidia.com/jetson/orin-nano-devkit/user-guide/latest/index.html +- whisper.cpp: https://github.com/ggml-org/whisper.cpp diff --git a/examples/3bo/CIRCUIT.md b/examples/3bo/CIRCUIT.md new file mode 100644 index 0000000..b9a8d8b --- /dev/null +++ b/examples/3bo/CIRCUIT.md @@ -0,0 +1,388 @@ +# 3bo circuit design + +This is the first bench circuit for 3bo: Jetson Orin local brain, Nano ESP32 +body controller, ICS-43434 I2S microphone, MAX98357A I2S amplifier, 
8 ohm +breadboard speaker, RGBW NeoPixel stick through a 74AHCT/74HCT level shifter, +and a physical mute switch. + +## Power topology + +Use Jetson-powered body-controller wiring for the preferred bench build: + +```text +Jetson Orin brain + -> vendor-recommended Jetson power supply + -> Ethernet or supported Wi-Fi adapter for development network + -> USB host port + -> USB-C data cable + -> Arduino Nano ESP32 power + serial data + +Nano USB/VBUS or verified Jetson-powered 5 V body rail + -> MAX98357A VIN + -> NeoPixel +5V + -> 74AHCT/74HCT VCC + +Nano 3.3 V rail + -> hard-mute switch or load switch + -> ICS-43434 VIN + +All Nano/body electronics grounds tied together through the USB/common ground. +``` + +For portable battery testing, keep the battery concerned only with the Jetson: + +```text +4S LiPo/Li-ion pack + -> fuse near battery positive + -> hard low-voltage cutoff / protected BMS output + -> DC-rated main switch + -> 19 V Jetson regulator + -> branch fuse/protection + -> Jetson 5.5 mm x 2.5 mm center-positive input + +Jetson USB host port + -> USB-C data cable + -> Nano ESP32 power + serial data + -> low-current body electronics + +If measured LED/audio current exceeds the safe Jetson USB/Nano 5 V budget, add +a Jetson-powered USB hub or current-limited 5 V accessory rail. Do not add a +separate battery-fed Nano/body branch unless the power design is reopened. +``` + +The Jetson is the only battery-fed high-current compute load. Use the +vendor-recommended Jetson supply first. For a final single-input enclosure, add +a dedicated regulator matched to the exact Jetson carrier board input. Keep the +Nano on Jetson USB power, and treat LED/audio 5 V as a measured Jetson-powered +USB/accessory load. + +The Nano and Jetson share ground through USB. Do not also power the Nano from +`VIN` while it is connected to the Jetson USB host unless the exact board power +path has been reviewed and backfeed protection is verified. 
The preferred 3bo +prototype uses one Nano power source: Jetson USB. + +The Adafruit product 184 supply is a low-dropout linear regulator. It is no +longer part of the runtime power path. Keep it for bench measurement or an +isolated low-current peripheral experiment, and do not connect it in parallel +with the Jetson USB-powered Nano/body rail. + +The simplest 3.3 V source is the Nano ESP32's `3V3` pin after the Nano is +powered from Jetson USB. The ICS-43434 microphone draws very little current, so +this is a reasonable low-current load. If you use a separate 3.3 V rail, use it +for peripherals only; do not backfeed the Nano `3V3` pin unless the board +documentation explicitly allows that power-in path. Route whichever 3.3 V mic +source you choose through the hard-mute switch or a load-switch module before it +reaches the microphone. + +## Proposed Nano ESP32 pins + +The ESP32-S3 can route I2S signals flexibly, so these pins are chosen for a +clean breadboard layout rather than because I2S is fixed to them. + +| Function | Nano pin | Direction | Notes | +| --- | --- | --- | --- | +| I2S bit clock | `D2` | output | Shared by mic and amp. | +| I2S word select / LR clock | `D3` | output | Shared by mic and amp. | +| I2S mic data | `D4` | input | From ICS-43434 `DOUT` / `SD`. | +| I2S amp data | `D5` | output | To MAX98357A `DIN`. | +| NeoPixel data | `D6` | output | Goes through 74AHCT/74HCT before NeoPixel `DIN`. | +| Mute sense | `D7` | input | Firmware-visible mute state. Use internal pullup or external 10 kOhm pullup. Switch pulls to GND. | +| Amp shutdown | optional `D8` | output | Optional. Tie amp `SD`/shutdown high if not firmware-controlled. | + +Avoid `D0`/`D1` for the first build so serial/debug behavior stays boring. 
+ +## Wiring table + +### Power and ground + +| From | To | Notes | +| --- | --- | --- | +| Jetson power supply or 19 V battery regulator | Jetson Orin carrier power input | Use the supply or regulator recommended for the exact carrier board. | +| Jetson network | Ethernet or supported Wi-Fi adapter | Use Ethernet for first bring-up if available. | +| Jetson USB host | Nano ESP32 USB-C | Runtime power and serial data for the Nano. Use a short data-capable cable. | +| Nano ESP32 `GND` | Breadboard GND rail | Required for every body signal; ground is common through USB. | +| Verified USB/VBUS 5 V or Jetson-powered accessory rail | Breadboard 5 V load rail | Feeds amp, NeoPixel, and level shifter. Verify current budget before connecting modules. | +| 5 V load rail `-` | Breadboard GND rail | Common ground for 5 V loads. | +| Nano ESP32 `3V3` | Hard-mute switch/load-switch input | Low-current 3.3 V mic rail for bench builds. Do not backfeed this pin. | +| Hard-mute switch/load-switch output | ICS-43434 `3V` / `VIN` | Microphone is physically unpowered when muted. | +| 470-1000 uF capacitor `+` | Breadboard 5 V rail | Place near NeoPixel/amp. | +| 470-1000 uF capacitor `-` | Breadboard GND rail | Observe polarity. | + +### ICS-43434 I2S microphone, product 6049 + +| ICS-43434 pin | Connects to | Notes | +| --- | --- | --- | +| `3V` / `VIN` | Hard-mute switch/load-switch output | Do not power from 5 V. This rail must be off when muted. | +| `GND` | GND rail | Common ground. | +| `BCLK` / `SCK` | Nano `D2` | I2S bit clock. | +| `WS` / `LRCL` | Nano `D3` | I2S word select. | +| `DOUT` / `SD` | Nano `D4` | I2S microphone data into Nano. | +| `SEL` / `L/R` | GND | Select one channel. Use 3V3 instead if firmware expects the other channel. | + +Mount the mic so the port faces outward and is not pressed against the table or +enclosure wall. 
+ +### MAX98357A I2S amplifier, product 3006 + +| MAX98357A pin | Connects to | Notes | +| --- | --- | --- | +| `VIN` | 5 V rail | Keep volume low with the 0.2 W speaker. | +| `GND` | GND rail | Common ground. | +| `BCLK` | Nano `D2` | Shared I2S bit clock. | +| `LRC` / `LRCLK` | Nano `D3` | Shared I2S word select. | +| `DIN` | Nano `D5` | I2S audio data from Nano. | +| `GAIN` | leave default | Default gain is fine for bring-up. | +| `SD` / shutdown | 3V3 or optional Nano `D8` | Tie enabled for first test, or control from firmware. If tied high, use 3.3 V logic. | +| `+` speaker output | Speaker `+` | Bridge-tied output. Do not connect to GND. | +| `-` speaker output | Speaker `-` | Bridge-tied output. Do not connect to GND. | + +The MAX98357A can overpower the 0.2 W speaker. Use the lowest possible software +volume for the first test, and prefer short test tones. For safer bring-up, +place a 47-100 ohm resistor in series with one speaker lead, or use an 8 ohm +1 W speaker. + +### Breadboard speaker, product 1898 + +| Speaker pin | Connects to | Notes | +| --- | --- | --- | +| One speaker pin | MAX98357A speaker `+` | Polarity is not critical for a single speaker. | +| Other speaker pin | MAX98357A speaker `-` | Do not connect either speaker pin to GND. | + +### 74AHCT/74HCT level shifter for NeoPixel data + +Use one channel of the 74AHCT/74HCT part. Exact pin names vary by package, but +the logic is the same. + +| Level shifter pin | Connects to | Notes | +| --- | --- | --- | +| `VCC` | 5 V rail | Makes the output a 5 V logic signal. | +| `GND` | GND rail | Common ground. | +| `A1` / input | Nano `D6` | 3.3 V NeoPixel data from Nano. | +| `Y1` / output | 330-470 ohm resistor | 5 V data toward NeoPixel. | +| `OE` / output enable | active state for your part | Tie to the enabled state; many 74AHCT parts use active-low `OE`, so tie `OE` to GND. | +| `DIR` | fixed direction, if present | Tie for A-to-Y direction on bidirectional parts. Not present on simple buffers. 
| +| Unused inputs | GND or defined level | Do not leave CMOS inputs floating. | + +### NeoPixel Stick RGBW, product 2869 + +| NeoPixel pin | Connects to | Notes | +| --- | --- | --- | +| `5V` / `+` | 5 V rail | Cap brightness in firmware. | +| `GND` / `-` | GND rail | Common ground. | +| `DIN` | 330-470 ohm resistor from level shifter output | Use the input side of the stick, not `DOUT`. | +| `DOUT` | unconnected | Only used if chaining another NeoPixel module. | + +Firmware must configure this as RGBW, not RGB. + +### Hard mute switch + +Use the switch for two things: physically disable microphone power and tell +firmware the muted state. A DPDT switch is the simplest prototype part because +one pole can switch the microphone rail while the other drives the Nano input. +An SPST switch is acceptable only if it controls a load-switch enable and the +firmware-visible state comes from the same hard-mute signal. + +| Switch pin | Connects to | Notes | +| --- | --- | --- | +| Pole A common | ICS-43434 `3V` / `VIN` | Switched microphone power. | +| Pole A unmuted throw | Nano `3V3` or separate 3.3 V mic rail | Mic receives power only when unmuted. | +| Pole A muted throw | unconnected | Leaves mic unpowered. | +| Pole B common | Nano `D7` | Configure as `INPUT_PULLUP`. | +| Pole B muted throw | GND rail | Switch closed means muted/active-low. | +| Pole B unmuted throw | unconnected | Internal pullup reads unmuted. | + +In firmware, treat `D7 == LOW` as muted. When muted, stop wake-word detection, +ignore audio frames, and show the muted LED state. The privacy guarantee comes +from the microphone power being removed, not from this firmware branch. + +## Bring-up order + +1. Bring up the Jetson Orin separately with its vendor-recommended power supply, + cooling, storage, network, and SSH access. +2. Build and run `arbiter --api` and the 3bo bridge on the Jetson before + connecting the robot body electronics. +3. 
Connect the Nano to a Jetson USB host port with a data-capable USB-C cable. + Confirm the Jetson sees the Nano serial device before connecting amp/LED + loads. +4. Verify the available 5 V body rail and current budget before connecting the + MAX98357A and NeoPixel stick. Keep LED brightness and speaker volume low. +5. Verify the hard-mute switch removes microphone 3.3 V in the muted position, + then connect the ICS-43434 microphone. +6. Test NeoPixel output through the 74AHCT/74HCT at low brightness. +7. Test the MAX98357A with a very quiet generated tone or WAV playback. +8. Test the ICS-43434 by printing audio levels or recording a short buffer. +9. Confirm the firmware can run I2S input and I2S output together. If shared + BCLK/WS is problematic, split mic and amp onto separate I2S peripherals or + disable playback while listening. +10. Verify both mute paths: microphone 3.3 V physically drops to 0 V and the + firmware sees the expected active-low state. +11. Point the Nano firmware at the Jetson bridge URL and run the full loop: + wake/listen LED states, local recording, bridge upload, local STT, Arbiter, + TTS playback. + +## First-test firmware constants + +```cpp +constexpr int PIN_I2S_BCLK = D2; +constexpr int PIN_I2S_WS = D3; +constexpr int PIN_I2S_MIC = D4; +constexpr int PIN_I2S_AMP = D5; +constexpr int PIN_PIXELS = D6; +constexpr int PIN_MUTE = D7; +constexpr int PIN_AMP_SD = D8; // optional + +constexpr int PIXEL_COUNT = 8; +constexpr bool MUTE_ACTIVE_LOW = true; +``` + +These constants may need adjustment depending on the Arduino Nano ESP32 core +and I2S library used. Keep the circuit table and firmware constants in sync. + +## PCA9685 servo driver (Milestone 5) + +This section documents the servo driver wiring for Milestone 5 (vision and head tracking). It is not needed for the v1 voice prototype. + +### Jetson 40-pin header to PCA9685 + +| Jetson 40-pin header | PCA9685 pin | Notes | +| --- | --- | --- | +| Pin 1 (3.3V) | VCC | Logic supply. 
Do not use 5V for VCC; PCA9685 logic is 3.3V or 5V compatible but 3.3V matches Jetson GPIO. | +| Pin 2 or 4 (5V) | V+ | Servo power rail. Two MG90S servos draw up to ~500 mA peak. | +| Pin 3 (SDA, I2C1) | SDA | I2C data. | +| Pin 5 (SCL, I2C1) | SCL | I2C clock. | +| Pin 6 or 9 (GND) | GND | Common ground. | + +Default I2C address: 0x40. Confirm with `i2cdetect -y -r 7` on the Jetson Orin Nano dev kit, where header pins 3/5 enumerate as Linux I2C bus 7 (bus 1 is pins 27/28); run `i2cdetect -l` to list buses if your module or carrier differs. + +### PCA9685 to servos + +| PCA9685 channel | Servo | Purpose | Pulse range | +| --- | --- | --- | --- | +| Channel 0 | MG90S Servo L | Left push/pull rod. Pitch+roll mixed output. | 500–2400 µs at 50 Hz | +| Channel 1 | MG90S Servo R | Right push/pull rod. Pitch−roll mixed output. | 500–2400 µs at 50 Hz | + +Each servo connector (signal, VCC/+, GND) plugs directly into the corresponding PCA9685 channel header. Signal = PWM output. VCC = from V+ rail. GND = common. + +MG90S centre position (0°) ≈ 1500 µs. At 50 Hz, one period = 20 ms. Approximate endpoints: +30° ≈ 1800 µs, −30° ≈ 1200 µs. Calibrate by observing servo behaviour — endpoints vary by unit. The software sends mixed pitch/roll commands; calibrate each servo independently against the physical rod geometry. + +Do not power servos from the Nano ESP32 or the body 5V rail. The PCA9685 V+ draws from the Jetson 40-pin 5V pins (pins 2/4), which are rated up to 3A on the Orin Nano dev kit. Keep servo V+ wiring short and use 22 AWG or heavier wire for the servo power run. + +## N20 base motor and slip ring (Milestone 5) + +This section documents the base rotation motor, H-bridge driver, encoder, and +slip ring wiring. Not needed for v1. + +### Slip ring + +The slip ring sits at the center of the base bearing and passes all power and +signals between the stationary base and the rotating body. Use a 12-wire +capsule slip ring rated ≥ 2 A per circuit.
+ +| Slip ring circuit | Carries | From → To | +| --- | --- | --- | +| 1–2 | 19 V, 3 A (Jetson supply) | Base regulator → body Jetson input | +| 3–4 | 5 V, 1.5 A (body rail) | Base 5 V regulator → body rail | +| 5–6 | GND | Common ground | +| 7 | Motor IN1 (PWM) | PCA9685 channel 2 → DRV8833 IN1 | +| 8 | Motor IN2 (PWM) | PCA9685 channel 3 → DRV8833 IN2 | +| 9 | Encoder A | N20 encoder A → Jetson GPIO | +| 10 | Encoder B | N20 encoder B → Jetson GPIO | +| 11–12 | Spare | Reserved | + +Keep signal wires (IN1, IN2, encoder A/B) away from the high-current 19 V and +motor power wires within the slip ring bundle. Twist encoder pairs if possible. + +### DRV8833 motor driver + +The DRV8833 lives in the base tier alongside the N20. It is powered from the +base 5 V rail and controlled by PWM signals from PCA9685 channels 2 and 3 +(carried through the slip ring). + +| DRV8833 pin | Connects to | Notes | +| --- | --- | --- | +| VM | 5 V rail (base) | Motor power. N20 at 6 V is fine on a clean 5 V rail; rated max is 10.8 V. | +| GND | GND | Common ground. | +| IN1 | PCA9685 channel 2 (via slip ring) | PWM forward signal. | +| IN2 | PCA9685 channel 3 (via slip ring) | PWM reverse signal. | +| OUT1 | N20 motor terminal A | H-bridge output. | +| OUT2 | N20 motor terminal B | H-bridge output. | +| SLP (sleep) | 3.3 V or 5 V (tied high) | Pull high to enable driver. Can be tied to a Jetson GPIO for software sleep if needed. | +| FLT (fault) | Jetson GPIO (optional) | Open-drain fault indicator; pull up to 3.3 V and read on a Jetson GPIO if overcurrent monitoring is wanted. | + +### N20 encoder + +The N20 quadrature magnetic encoder outputs two square wave channels (A and B) +90° out of phase. Connect to two Jetson GPIO interrupt-capable pins. + +| Encoder wire | Connects to | Notes | +| --- | --- | --- | +| VCC | 3.3 V (base rail) | Encoder logic supply. Verify encoder VCC requirement; most N20 encoders accept 3.3–5 V. | +| GND | GND | Common ground. 
| +| A | Jetson GPIO (interrupt pin, via slip ring) | Channel A quadrature output. | +| B | Jetson GPIO (interrupt pin, via slip ring) | Channel B quadrature output. | + +Encoder count direction (CW vs CCW) depends on motor orientation. Determine +the positive direction during first bring-up and set the sign convention in +software. + +### PCA9685 channel allocation (updated) + +| Channel | Use | Signal type | +| --- | --- | --- | +| 0 | Pan servo | 50 Hz servo PWM | +| 1 | Tilt servo | 50 Hz servo PWM | +| 2 | Base motor IN1 (DRV8833) | PWM (0–100 % duty) | +| 3 | Base motor IN2 (DRV8833) | PWM (0–100 % duty) | +| 4–15 | Available | — | + +Drive the base motor by setting one channel to the desired duty cycle and the +other to 0 (both at 0 coasts; both at 100 % brakes). Apart from that deliberate +full-duty brake, do not set both channels to a non-zero duty simultaneously. + +## OV5640 camera (Milestone 5) + +The head camera is an Adafruit OV5640 Camera Breakout — 72° Lens with Autofocus +(product 5945). It connects to the Arduino Nano ESP32 via 8-bit parallel DVP. +On the prototype, wire the PiCowbell breakout's header pins to the Nano ESP32 +with jumper wires. On the final build, route signals through the custom carrier +board. + +### XCLK + +The PiCowbell breakout has an onboard 24 MHz oscillator. Enable it via the +board's XCLK jumper so the ESP32-S3 does not need to generate the clock. + +### DVP signal wiring (prototype — jumper wires) + +Consult the PiCowbell schematic for the Pico GPIO numbers that carry each +signal, then map them to any available ESP32-S3 GPIO pins. The `esp32-camera` +configuration struct assigns signals by GPIO number, so the mapping is flexible. +Keep data wires short to minimize parallel bus noise.
+ +| OV5640 signal | Direction | Notes | +| --- | --- | --- | +| D0–D7 | input to ESP32-S3 | 8-bit pixel data | +| VSYNC | input to ESP32-S3 | Frame sync | +| HREF | input to ESP32-S3 | Line sync | +| PCLK | input to ESP32-S3 | Pixel clock | +| SIOD | bidirectional | I2C data for autofocus (SCCB) | +| SIOC | output from ESP32-S3 | I2C clock for autofocus (SCCB) | +| RESET | output from ESP32-S3 | Active-low sensor reset | +| PWDN | output from ESP32-S3 | Active-high power-down | +| 3.3 V | power | From Nano ESP32 `3V3` or shared 3.3 V rail | +| GND | ground | Common ground | + +### Library + +Use the `esp32-camera` component (Espressif). OV5640 is a supported sensor. +Configure the pin assignments in the camera config struct to match the chosen +GPIO mapping. Set output format to JPEG and target VGA (640×480) or higher +depending on bandwidth and latency measurements. + +The Adafruit PiCowbell library is RP2040-specific and is not used here. + +## Wake-word audio format + +ESP-SR WakeNet expects 16 kHz mono signed 16-bit audio. The ICS-43434 outputs +24-bit I2S audio, so firmware must convert the incoming I2S samples into the +format expected by the wake-word engine before inference. diff --git a/examples/3bo/FIRMWARE.md b/examples/3bo/FIRMWARE.md new file mode 100644 index 0000000..179b07f --- /dev/null +++ b/examples/3bo/FIRMWARE.md @@ -0,0 +1,218 @@ +# 3bo firmware design + +Firmware owns the physical robot loop: local wake, mute, microphone capture, camera frame capture and forwarding, bridge communication, speaker playback, LEDs, and recovery. It has no knowledge of Arbiter tenant tokens, model provider keys, STT credentials, or TTS credentials — those live on the Jetson Orin in the bridge service. + +## Firmware tracks + +| Track | Build system | Purpose | +| --- | --- | --- | +| Arduino bench firmware | Arduino IDE / Arduino CLI | Bring up the current hardware: Wi-Fi, mute, NeoPixels, I2S mic, I2S amp, bridge upload, WAV playback. 
| +| ESP-IDF production firmware | ESP-IDF with ESP-SR | Real wake-word implementation with WakeNet/AFE, VAD, better buffering, diagnostics, and OTA. | + +Arduino is the fastest path to end-to-end audio. ESP-IDF is the right long-term home because Espressif's ESP-SR WakeNet/AFE stack requires that environment. + +## Audio format + +| Field | Value | +| --- | --- | +| Container | WAV for v1, raw PCM allowed later | +| Sample rate | 16000 Hz | +| Channels | 1 mono | +| Sample format | signed 16-bit little-endian PCM | +| Max utterance | 4-8 seconds for bench firmware | + +The ICS-43434 emits I2S audio in wider slots at higher resolution. Convert to 16 kHz mono signed 16-bit before wake inference and before bridge upload. + +## I2S strategy + +Use the I2S bus in one active direction at a time: + +1. Idle/listening: configure I2S RX for the ICS-43434 microphone. +2. Upload/thinking: stop I2S while Wi-Fi posts to the Jetson bridge. +3. Speaking: configure I2S TX for the MAX98357A amplifier. +4. Return to idle: stop TX and reconfigure RX. + +This avoids full-duplex clocking surprises during hardware bring-up. The ESP32-S3 can support richer I2S arrangements later, but sequential RX/TX matches the first product behavior: 3bo does not accept a new wake word while it is already speaking. + +## State machine + +```mermaid +stateDiagram-v2 + [*] --> boot + boot --> wifi_connecting + wifi_connecting --> idle: connected + wifi_connecting --> error: failed + idle --> muted: switch active + muted --> idle: switch inactive + idle --> wake_detected: local wake + wake_detected --> listening + listening --> uploading: utterance complete + listening --> idle: timeout / silence + uploading --> thinking: upload accepted + uploading --> error: bridge error + thinking --> speaking: response audio ready + thinking --> error: Arbiter/STT/TTS error + speaking --> idle: playback done + error --> idle: recoverable + error --> wifi_connecting: Wi-Fi lost +``` + +State ownership stays local. 
The bridge can request high-level state changes, but firmware clamps those to known states and patterns. + +## Wake provider interface + +Wake detection is a replaceable provider: + +```cpp +struct WakeResult { + bool detected; + float confidence; +}; + +class WakeProvider { + public: + void begin(); + WakeResult feed(const int16_t *samples, size_t sample_count); + void setMuted(bool muted); +}; +``` + +Bench firmware uses a serial test trigger or a crude energy trigger so the rest of the robot loop is testable. Product firmware replaces that with ESP-SR WakeNet through ESP-IDF. WakeNet expects 16 kHz mono signed 16-bit audio, so the audio conversion layer is shared by both paths. + +## Buffering + +| Buffer | Size target | Purpose | +| --- | --- | --- | +| Wake frame | 30 ms | Matches WakeNet's feature frame cadence. | +| Pre-roll | 500-1000 ms | Captures the beginning of the user's sentence after wake. | +| Utterance ring | 4-8 seconds | Avoids one large blocking allocation. | +| Playback chunk | 512-2048 bytes | Smooth I2S TX without large RAM spikes. | + +The Nano ESP32 has PSRAM, so a small in-memory WAV is acceptable for the bench prototype. Move to ring-buffered streaming once the bridge and audio path are stable. + +## Bridge protocol + +### Blocking v1 + +The current firmware posts to the Jetson bridge over Wi-Fi/HTTP: + +```http +POST /v1/utterance HTTP/1.1 +Authorization: Bearer <device-secret> +Content-Type: audio/wav +Content-Length: <body-bytes> +``` + +The Nano's `THREEBO_BRIDGE_BASE_URL` points at the Jetson on the local network, for example `http://3bo.local:8081` or the Jetson's LAN IP. + +The bridge returns: + +```http +200 OK +Content-Type: audio/wav +Content-Length: <body-bytes> +``` + +Return 16 kHz mono signed 16-bit PCM WAV audio. Keep TTS amplitude low; the MAX98357A can overpower a 0.2 W speaker. Reject requests with a missing or invalid `Authorization: Bearer` header before running STT, TTS, or Arbiter.
Add bridge-side request size limits, per-device rate limits, and a short upload timeout so a LAN client cannot turn the robot into an open compute/audio proxy. + +### USB serial product path + +> **Not yet implemented.** The framing protocol below is the target design for Milestone 4. The current v1 firmware uses the Wi-Fi/HTTP path only. + +The preferred Nano-to-Jetson connection is USB CDC serial over the same USB-C cable that powers the Nano. The Jetson bridge opens the Nano device (usually `/dev/ttyACM0`) and exchanges framed messages. + +```text +magic: "3BO1" +type: 1 byte // state, utterance.chunk, utterance.done, speech.chunk, frame, error +flags: 1 byte +length: uint32 little-endian payload bytes +payload: length bytes +crc32: uint32 little-endian over header + payload +``` + +The `frame` message type carries a JPEG-compressed camera frame from the OV5640. The ESP32-S3 captures frames via its parallel DVP camera peripheral (`esp32-camera` library), JPEG-compresses them, and emits one `frame` message per captured image. The Jetson vision service reads these from the serial port to feed MediaPipe and the `/frame` HTTP endpoint. + +Start with small `state` and telemetry frames. Add utterance chunks once serial framing, retries, and bridge logging are stable. Mirror the HTTP auth concept by pairing the device secret during session setup. + +### Streaming v2 (M4 design, not implemented) + +| Event | Payload | Firmware behavior | +| --- | --- | --- | +| `state` | `{ "state": "thinking" }` | Switch LED pattern. | +| `speech.chunk` | WAV/PCM bytes or URL | Queue playback. | +| `speech.done` | `{}` | Return to idle after queue drains. | +| `error` | `{ "message": "..." }` | Error LED and recovery. | + +Streaming lets the bridge start speaking sentence chunks while Arbiter is still finishing the answer. 
+ +## LED policy + +| State | Pattern | +| --- | --- | +| `boot` | short white fill | +| `wifi_connecting` | blue rotating pixel | +| `idle` | low white breath | +| `wake_detected` | quick white flash | +| `listening` | blue pulse | +| `uploading` | blue chase | +| `thinking` | amber sweep | +| `speaking` | white/green voice meter or pulse | +| `muted` | dim red | +| `error` | red blink | + +Keep maximum brightness low until the Jetson USB-powered 5 V body budget is measured under load. + +## Error handling + +- If Wi-Fi disconnects, stop audio capture and show `wifi_connecting`. +- If the Jetson bridge upload fails, show `error`, wait briefly, then return to idle. +- If playback audio is too large or lacks `Content-Length`, reject it and log the reason. +- If mute becomes active during recording, discard the utterance. +- If mute becomes active during speaking, stop playback when the audio layer supports interruption; for the bench sketch, finish the current blocking playback at low volume. +- Hardware mute must also remove microphone power. Firmware mute handling is a second layer for state and UX, not the privacy boundary. +- If audio init fails, keep the device in `error` and report over serial. + +## Configuration + +Firmware configuration contains only device-local settings: + +- Wi-Fi SSID and password. +- Jetson bridge base URL. +- Device ID. +- Per-device bridge secret. +- LED brightness cap. +- Max recording duration. +- Optional development wake trigger. + +Do not place Arbiter tenant tokens, provider API keys, or bridge admin secrets in firmware. The per-device bridge secret is only a device-pairing credential for the local bridge; rotate it if the firmware image is shared. + +## Development order + +### Completed (M1–M2) + +1. Bring up the Jetson Orin with JetPack, cooling, storage, network, SSH, and system updates. +2. Build Arbiter on the Jetson and run `arbiter --api` on localhost. +3. 
Run a local STT smoke test on the Jetson with a saved 16 kHz WAV file. +4. Run a local TTS smoke test and confirm output is 16 kHz mono WAV. +5. Flash the Arduino bench firmware with Wi-Fi and LED animation only. +6. Add I2S TX and play a quiet WAV/tone through the MAX98357A. +7. Add I2S RX and print microphone levels. +8. Upload a fixed-duration WAV from the Nano to the Jetson bridge. +9. Have the Jetson bridge return a small WAV and play it. +10. Enforce the bridge shared secret, body-size cap, and per-device rate limit. + +### Remaining (M3–M4) + +11. Add VAD and pre-roll. +12. Port the wake provider to ESP-IDF + ESP-SR WakeNet. +13. Add streaming playback and OTA. + +## Source references + +- Arduino Nano ESP32 documentation: https://docs.arduino.cc/hardware/nano-esp32/ +- Arduino Nano ESP32 store/spec page: https://store.arduino.cc/products/nano-esp32 +- Arduino-ESP32 I2S API: https://docs.espressif.com/projects/arduino-esp32/en/latest/api/i2s.html +- ESP-IDF I2S programming guide: https://docs.espressif.com/projects/esp-idf/en/latest/esp32s3/api-reference/peripherals/i2s.html +- ESP-SR WakeNet documentation: https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/wake_word_engine/README.html +- NVIDIA Jetson Orin Nano Super Developer Kit: https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/jetson-orin/nano-super-developer-kit/ +- whisper.cpp: https://github.com/ggml-org/whisper.cpp diff --git a/examples/3bo/JETSON.md b/examples/3bo/JETSON.md new file mode 100644 index 0000000..d221d9c --- /dev/null +++ b/examples/3bo/JETSON.md @@ -0,0 +1,720 @@ +# 3bo Jetson Orin brain + +The Jetson Orin is 3bo's onboard Linux brain. It runs the software that is too +large or too credential-sensitive for the Nano ESP32: Arbiter, speech-to-text, +text-to-speech, the voice bridge, conversation state, logs, and model storage. 
+ +This document assumes the Jetson Orin Nano Super Developer Kit (JetPack 6, +Ubuntu 22.04, ARM64) unless a different module or carrier is noted. + +--- + +## Role split + +| Layer | Device | Responsibilities | +| --- | --- | --- | +| Body controller | Arduino Nano ESP32 | Wake trigger, I2S mic capture, LEDs, speaker playback, mute switch. Powered over Jetson USB. | +| Local brain | Jetson Orin | Arbiter daemon, Ollama, whisper.cpp STT, Piper TTS, 3bo voice bridge, logs, model storage. | + +The Nano is deterministic and replaceable. The Jetson iterates like a normal +Linux service stack. + +--- + +## Service map + +| Service | Bind address | Port | +| --- | --- | --- | +| Arbiter API | `127.0.0.1` | `8080` | +| Ollama | `127.0.0.1` | `11434` | +| 3bo bridge | `0.0.0.0` | `8081` | +| whisper.cpp | subprocess (no port) | — | +| Piper | subprocess (no port) | — | + +Arbiter and Ollama are loopback-only. The bridge is the only service that +accepts connections from the Nano or the LAN. + +--- + +## Model directory + +Put all model files under `/opt/3bo/models` so every service config points to +one location: + +```sh +sudo mkdir -p /opt/3bo/models +sudo chown $USER /opt/3bo/models +``` + +Recommended layout after all installs: + +``` +/opt/3bo/models/ + ggml-tiny.en.bin whisper.cpp — fast latency test + ggml-base.en.bin whisper.cpp — recommended first target + en_US-amy-low.onnx Piper voice model (16 kHz output) + en_US-amy-low.onnx.json Piper voice config (required alongside .onnx) +``` + +--- + +## 1. Jetson prerequisites + +Flash JetPack 6 from SDK Manager or a prebuilt SD image. Confirm the device +boots, cooling is active, and SSH works before continuing. 
+ +Install build tools and Arbiter's library dependencies in one pass: + +```sh +sudo apt update +sudo apt install -y \ + build-essential cmake git \ + libssl-dev libcurl4-openssl-dev libsqlite3-dev libreadline-dev \ + python3 python3-pip python3-venv +``` + +### mDNS hostname + +The firmware uses `3bo.local` to reach the bridge. Set the Jetson's hostname +and enable the mDNS responder: + +```sh +sudo hostnamectl set-hostname 3bo +sudo apt install -y avahi-daemon +sudo systemctl enable avahi-daemon +sudo systemctl start avahi-daemon +``` + +Confirm from another machine on the same LAN: `ping 3bo.local` + +If you prefer a static IP instead of mDNS, set `THREEBO_BRIDGE_BASE_URL` to the +Jetson's IP address in firmware and skip avahi. + +Confirm CUDA is available (Jetson ships with it in JetPack 6): + +```sh +nvcc --version +``` + +--- + +## 2. Arbiter + +### Build + +```sh +git clone https://github.com/your-org/arbiter ~/arbiter +cd ~/arbiter +cmake -B build -DCMAKE_BUILD_TYPE=Release +cmake --build build -j$(nproc) +sudo install -m 755 build/arbiter /usr/local/bin/arbiter +``` + +### Provider API keys + +Arbiter reads provider keys from environment variables or files in +`~/.arbiter/`. Set the keys for any providers the robot will use: + +```sh +# Anthropic (cloud agent — required for the index/cloud path) +echo "sk-ant-..." > ~/.arbiter/api_key +chmod 600 ~/.arbiter/api_key + +# OpenAI (optional alternative cloud provider) +# echo "sk-..." > ~/.arbiter/openai_api_key + +# Ollama is keyless — no file needed +``` + +Or export them as environment variables before starting the server: + +```sh +export ANTHROPIC_API_KEY="sk-ant-..." 
+``` + +### First start + +Run the server once interactively so you can see the admin token: + +```sh +arbiter --api --bind 127.0.0.1 --port 8080 +``` + +On first run Arbiter prints the admin token **once** — save it immediately: + +``` +Admin token (save this — not shown again): + aat_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +Stored at: /home/jetson/.arbiter/admin_token (0600) +``` + +The admin token controls tenant provisioning. It is stored at +`~/.arbiter/admin_token` after the first run and reused on every subsequent +start. + +### Provision a tenant token + +Stop the server (`Ctrl-C`), then provision the 3bo tenant: + +```sh +arbiter --add-tenant 3bo +``` + +Output: + +``` +Created tenant #1 (3bo) + + API key (save this — it will not be shown again): + atr_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +``` + +Save the `atr_...` token. This is `THREEBO_ARBITER_TOKEN` for the bridge. The +database stores only a SHA-256 digest; a lost token means running +`arbiter --add-tenant 3bo` again to issue a new one. + +### Restart + +```sh +arbiter --api --bind 127.0.0.1 --port 8080 +``` + +Verify with: + +```sh +curl -s http://127.0.0.1:8080/v1/health +# {"status":"ok"} +``` + +--- + +## 3. Ollama (local fast-path model) + +Ollama provides the local inference backend for simple queries, bypassing the +cloud entirely for arithmetic, time, greetings, and short factual questions. + +### Install + +```sh +curl -fsSL https://ollama.com/install.sh | sh +``` + +The installer creates an `ollama` systemd service that starts automatically. +Ollama listens on `localhost:11434` by default — no configuration needed for +the local case. + +### Pull the fast-path model + +```sh +ollama pull gemma3:4b +``` + +`gemma3:4b` fits in the Jetson Orin Nano's 8 GB and gives first-token latency +around 200–350 ms on quantized weights. For lower latency at some accuracy +cost, `gemma3:1b` is an option. + +Confirm inference works: + +```sh +ollama run gemma3:4b "What is 12 times 8?"
+# 96 +``` + +### Arbiter + Ollama + +Arbiter routes `ollama/` requests to `$OLLAMA_HOST` (default +`http://localhost:11434`). No additional configuration is needed when Ollama +and Arbiter run on the same machine. + +--- + +## 4. whisper.cpp (STT) + +### Build with CUDA + +The Jetson's CUDA cores accelerate whisper inference significantly. Build +with `WHISPER_CUDA=ON`: + +```sh +git clone https://github.com/ggml-org/whisper.cpp ~/whisper.cpp +cd ~/whisper.cpp +cmake -B build -DWHISPER_CUDA=ON -DCMAKE_BUILD_TYPE=Release +cmake --build build -j$(nproc) +sudo install -m 755 build/bin/whisper-cli /usr/local/bin/whisper-cli +``` + +If the CUDA build fails, drop the flag for a CPU-only build: + +```sh +cmake -B build -DCMAKE_BUILD_TYPE=Release +``` + +CPU inference is slower (~600–900 ms for `base.en` on a 4 s clip) but +correct. + +### Download models + +whisper.cpp ships a download script for the standard ggml models: + +```sh +cd ~/whisper.cpp +bash models/download-ggml-model.sh tiny.en +bash models/download-ggml-model.sh base.en +cp models/ggml-tiny.en.bin models/ggml-base.en.bin /opt/3bo/models/ +``` + +### Latency reference + +These times are approximate on the Jetson Orin Nano with a 4 s utterance and +the CUDA build: + +| Model | VRAM | Latency | Notes | +| --- | --- | --- | --- | +| `tiny.en` | ~75 MB | ~80–120 ms | Fastest; lower accuracy on noisy input | +| `base.en` | ~145 MB | ~150–250 ms | Recommended starting point | +| `small.en` | ~465 MB | ~400–600 ms | Better accuracy; measure thermals first | + +Start with `base.en`. Move to `tiny.en` if latency tests show the STT step is +dominating; move to `small.en` if recognition accuracy is the bottleneck. + +### Smoke test + +```sh +# Record a short clip on any machine and copy it to the Jetson, or use +# a saved WAV. The file must be 16 kHz mono signed-16-bit PCM. 
+whisper-cli \ + -m /opt/3bo/models/ggml-base.en.bin \ + -f /path/to/test.wav \ + -nt -l en +``` + +Expected output is the transcript on stdout with no timestamps. + +--- + +## 5. Piper (TTS) + +Piper is a lightweight neural TTS engine that runs well on ARM without a GPU. +Use a `low` quality voice for 16 kHz output that matches the ESP32 playback +pipeline. + +### Install + +```sh +pip3 install piper-tts +``` + +If `pip` resolves an incompatible wheel for your Python version, download the +ARM64 release binary directly from the Piper GitHub releases page and install +it to `/usr/local/bin/piper`. + +### Download the voice model + +Each Piper voice consists of two files: an `.onnx` model and a `.onnx.json` +config. Both must be in the same directory. + +Visit the [Piper voices repository](https://github.com/rhasspy/piper) and +download a `low` quality English voice. `en_US-amy-low` outputs 16 kHz, which +matches the ESP32 firmware's expected sample rate: + +```sh +# Download amy-low (16 kHz output — matches firmware) +cd /opt/3bo/models + +# .onnx model +wget -q "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/amy/low/en_US-amy-low.onnx" + +# .onnx.json config (required alongside the model) +wget -q "https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/amy/low/en_US-amy-low.onnx.json" +``` + +`low` quality voices output 16 kHz PCM. `medium` and `high` voices output +22 050 Hz. If you use a higher-quality voice, set `THREEBO_PIPER_SAMPLE_RATE` +to match and confirm the WAV header in the bridge response matches what the +ESP32 expects. + +### Smoke test + +Piper with `--output_raw` writes raw signed-16-bit PCM to stdout. Pipe it +through `aplay` or `sox` to verify the voice sounds correct before wiring it +into the bridge: + +```sh +echo "Hello, I am 3bo." | piper \ + --model /opt/3bo/models/en_US-amy-low.onnx \ + --output_raw | aplay -r 16000 -f S16_LE -c 1 +``` + +--- + +## 6. 
Bridge + +### Configure environment variables + +Create an env file at `/etc/3bo/bridge.env` (or any secure path): + +```sh +sudo mkdir -p /etc/3bo +sudo tee /etc/3bo/bridge.env > /dev/null <<'EOF' +# Auth — must match THREEBO_DEVICE_SECRET in threebo_config.h +THREEBO_DEVICE_SECRET=replace-with-a-random-secret + +# Arbiter — the atr_... token from `arbiter --add-tenant 3bo` +THREEBO_ARBITER_TOKEN=atr_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + +# STT +THREEBO_WHISPER_MODEL=/opt/3bo/models/ggml-base.en.bin + +# TTS +THREEBO_PIPER_MODEL=/opt/3bo/models/en_US-amy-low.onnx +# THREEBO_PIPER_SAMPLE_RATE=16000 (default; change if using a medium/high voice) + +# Conversation memory — saves conversation_id across bridge restarts +THREEBO_CONVERSATION_FILE=/etc/3bo/conversation.json + +# Agent routing (defaults match what we create below) +# THREEBO_LOCAL_AGENT=local +# THREEBO_CLOUD_AGENT=index +EOF +sudo chmod 600 /etc/3bo/bridge.env +``` + +### Register the local Arbiter agent + +The `local` agent sends simple queries to Ollama instead of the cloud. Create +it once with the admin API. Arbiter must be running: + +```sh +# Load the admin token +ARBITER_ADMIN_TOKEN=$(cat ~/.arbiter/admin_token) + +# Create the local fast-path agent (stored in ~/.arbiter/tenants.db) +curl -s -X POST http://127.0.0.1:8080/v1/agents \ + -H "Authorization: Bearer atr_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" \ + -H "Content-Type: application/json" \ + -d '{ + "id": "local", + "role": "quick-responder", + "model": "ollama/gemma3:4b", + "max_tokens": 256, + "temperature": 0.2, + "goal": "Answer simple, short questions in one or two sentences. Be direct and concise. Do not add preamble." + }' +``` + +The `index` agent (cloud) already exists as the default orchestrator. No +additional registration is needed unless you want to override its model or +goal. 
+ +### Run + +```sh +cd /path/to/arbiter/examples/3bo/bridge +set -a; source /etc/3bo/bridge.env # -a exports the variables so bridge.py inherits them +python3 bridge.py --host 0.0.0.0 --port 8081 +``` + +### End-to-end smoke test + +Test each stage independently before connecting the Nano. + +**STT only:** + +```sh +source /etc/3bo/bridge.env +curl -s -X POST http://localhost:8081/v1/transcribe \ + -H "Authorization: Bearer $THREEBO_DEVICE_SECRET" \ + -H "Content-Type: audio/wav" \ + --data-binary @/path/to/test.wav +# {"transcript":"what time is it"} +``` + +**Full utterance pipeline:** + +```sh +curl -s -X POST http://localhost:8081/v1/utterance \ + -H "Authorization: Bearer $THREEBO_DEVICE_SECRET" \ + -H "Content-Type: audio/wav" \ + --data-binary @/path/to/test.wav \ + --output response.wav +aplay -r 16000 -f S16_LE -c 1 response.wav +``` + +**Unauthenticated rejection check:** + +```sh +curl -s -o /dev/null -w "%{http_code}" \ + -X POST http://localhost:8081/v1/utterance \ + -H "Content-Type: audio/wav" \ + --data-binary @/path/to/test.wav +# 401 +``` + +--- + +## 7. Systemd services + +### Arbiter + +```sh +sudo tee /etc/systemd/system/arbiter-api.service > /dev/null <<'EOF' +[Unit] +Description=Arbiter API server +After=network.target +Wants=ollama.service + +[Service] +Type=simple +User=jetson +EnvironmentFile=/etc/3bo/bridge.env +Environment=ANTHROPIC_API_KEY=sk-ant-... +ExecStart=/usr/local/bin/arbiter --api --bind 127.0.0.1 --port 8080 +Restart=on-failure +RestartSec=5 + +[Install] +WantedBy=multi-user.target +EOF +``` + +Put the Anthropic key in the `Environment=` line above, or add it to +`/etc/3bo/bridge.env` and reference it there. Do not put provider keys in +`~/.arbiter/api_key` when running as a system service under a different user.
+ +### 3bo bridge + +```sh +sudo tee /etc/systemd/system/3bo-bridge.service > /dev/null <<'EOF' +[Unit] +Description=3bo voice bridge +After=arbiter-api.service ollama.service +Requires=arbiter-api.service + +[Service] +Type=simple +User=jetson +EnvironmentFile=/etc/3bo/bridge.env +WorkingDirectory=/home/jetson/arbiter/examples/3bo/bridge +ExecStart=/usr/bin/python3 bridge.py --host 0.0.0.0 --port 8081 +Restart=on-failure +RestartSec=5 + +[Install] +WantedBy=multi-user.target +EOF +``` + +### Enable and start + +```sh +sudo systemctl daemon-reload +sudo systemctl enable ollama arbiter-api 3bo-bridge +sudo systemctl start ollama arbiter-api 3bo-bridge +sudo systemctl status arbiter-api 3bo-bridge +``` + +### Log tailing + +```sh +journalctl -fu arbiter-api +journalctl -fu 3bo-bridge +``` + +--- + +## 8. Performance tuning + +### Jetson power mode + +The Jetson Orin Nano has several power profiles. Use `15W` or higher for +inference work: + +```sh +sudo nvpmodel -m 0 # MAXN (full power) +sudo jetson_clocks # lock CPU/GPU clocks at max +``` + +Check current mode: + +```sh +sudo nvpmodel -q +``` + +For battery-powered deployment, `10W` is a reasonable compromise between +latency and power draw. Measure first-token latency under each mode before +committing to a target. + +### Latency budget + +Approximate end-to-end latency for a typical 4 s utterance on `MAXN` with +`ggml-base.en` and Gemma 3 4B for a local-tier query: + +| Stage | Time | +| --- | --- | +| Wake detection (ESP32) | ~10 ms | +| Audio upload over USB/Wi-Fi | ~50–150 ms | +| STT (`base.en`, CUDA) | ~150–250 ms | +| Complexity classification | < 1 ms | +| Arbiter → Ollama TTFT (`gemma3:4b`) | ~200–350 ms | +| Piper first sentence | ~80–120 ms | +| Audio return to ESP32 | ~30–80 ms | +| **Total (local tier)** | **~520–960 ms** | + +For cloud-tier queries replace the Ollama TTFT row with ~400–700 ms for +Claude, and all other rows remain the same. 
+ +The bridge pipelines Piper synthesis with model generation — sentences 1–N−1 +synthesise while the model generates sentence N — so TTS is not fully +serialised after the model finishes. + +### Whisper model selection + +Switch models by changing `THREEBO_WHISPER_MODEL` and restarting the bridge: + +```sh +# Faster, slightly less accurate +THREEBO_WHISPER_MODEL=/opt/3bo/models/ggml-tiny.en.bin + +# More accurate, slower +THREEBO_WHISPER_MODEL=/opt/3bo/models/ggml-small.en.bin +``` + +Run `/v1/transcribe` tests with representative 3bo utterances to measure +both accuracy and latency before switching in production. + +--- + +## Bring-up checklist + +Run these steps in order. Each depends on the previous. + +### Step 1 — platform + +- [ ] Flash JetPack 6 and boot. +- [ ] Confirm active cooling (`jtop` or `tegrastats`). +- [ ] Enable SSH and confirm network access. +- [ ] Install system packages (`build-essential cmake libssl-dev` …). + +### Step 2 — Arbiter + +- [ ] Build Arbiter and install to `/usr/local/bin/arbiter`. +- [ ] Write provider key to `~/.arbiter/api_key`. +- [ ] Run `arbiter --api` once interactively; save the admin token. +- [ ] Run `arbiter --add-tenant 3bo`; save the `atr_...` token. +- [ ] Confirm `curl http://127.0.0.1:8080/v1/health` returns `{"status":"ok"}`. + +### Step 3 — Ollama + +- [ ] Install Ollama. +- [ ] Pull `gemma3:4b`. +- [ ] Confirm `ollama run gemma3:4b "What is 7 times 6?"` returns `42`. + +### Step 4 — whisper.cpp + +- [ ] Build with `WHISPER_CUDA=ON`. +- [ ] Download `ggml-base.en.bin` to `/opt/3bo/models/`. +- [ ] Run `whisper-cli -m /opt/3bo/models/ggml-base.en.bin -f test.wav -nt` and confirm transcript. + +### Step 5 — Piper + +- [ ] Install `piper-tts`. +- [ ] Download `en_US-amy-low.onnx` and `en_US-amy-low.onnx.json` to `/opt/3bo/models/`. +- [ ] Smoke test: `echo "Hello." | piper --model ... --output_raw | aplay -r 16000 -f S16_LE -c 1`. 
+ +### Step 6 — bridge + +- [ ] Write `/etc/3bo/bridge.env` with all required variables. +- [ ] Register the `local` Arbiter agent via `curl POST /v1/agents`. +- [ ] Start the bridge: `python3 bridge.py --host 0.0.0.0 --port 8081`. +- [ ] Confirm `GET /health` returns `ok`. +- [ ] Confirm unauthenticated `POST /v1/utterance` returns `401`. +- [ ] Run authenticated `/v1/transcribe` test and confirm transcript. +- [ ] Run authenticated `/v1/utterance` test and play back the WAV. + +### Step 7 — hardware loop + +- [ ] Flash firmware with `THREEBO_BRIDGE_BASE_URL` pointing at the Jetson. +- [ ] Send a serial wake (`w` over Serial) from the Arduino IDE monitor. +- [ ] Confirm the Nano records, uploads, receives a WAV, and plays it. +- [ ] Confirm LED states transition: Idle → Wake → Listening → Uploading → Thinking → Speaking → Idle. + +### Step 8 — services + +- [ ] Install `arbiter-api.service` and `3bo-bridge.service` unit files. +- [ ] Enable and start all services. +- [ ] Reboot the Jetson and confirm all services start automatically. +- [ ] Run the hardware loop test again after reboot. + +--- + +## 9. Vision service (Milestone 5 — planned) + +> Not needed for the v1 voice prototype. Complete Steps 1–8 of the bring-up checklist first. +> Full design specification: [VISION.md](VISION.md). + +### Additional hardware + +The OV5640 camera lives in the robot head and connects to the Arduino Nano +ESP32, not directly to the Jetson. The ESP32-S3 captures frames, JPEG-compresses +them, and forwards them to the Jetson over the existing USB serial link. No +camera hardware connection to the Jetson is required. + +Wire the PCA9685 servo driver to the Jetson 40-pin header: + +| Jetson pin | PCA9685 | +| --- | --- | +| Pin 1 (3.3V) | VCC | +| Pin 2 or 4 (5V) | V+ (servo power) | +| Pin 3 (SDA) | SDA | +| Pin 5 (SCL) | SCL | +| Pin 6 or 9 (GND) | GND | + +Confirm I2C: `sudo i2cdetect -y 1` should show the PCA9685 at address `0x40`.
+ +Confirm camera: the OV5640 is attached to the ESP32-S3, so it does not appear as a `/dev/video*` device on the Jetson and +`nvgstcapture-1.0` cannot preview it. Instead, check that the Nano's USB serial port is present (typically `/dev/ttyACM0`) and that JPEG frames arrive on it. + +### Additional Python dependencies + +```sh +pip3 install mediapipe opencv-python smbus2 pyserial +``` + +`pyserial` is required for the vision service to read JPEG frames from the +ESP32-S3 over the USB serial port. + +### Additional Ollama model + +```sh +ollama pull moondream +``` + +moondream (~1.6B) handles visual queries ("what do you see?"). It runs +alongside `gemma3:4b` within the Jetson's 8 GB. + +### Bring-up checklist — Milestone 5 + +- [ ] Confirm `i2cdetect -y 1` shows PCA9685 at 0x40. +- [ ] Flash ESP32-S3 firmware with OV5640 camera capture and `frame` USB serial message support. +- [ ] Confirm Jetson sees JPEG frames arriving on the USB serial port from the ESP32-S3. +- [ ] Install Python deps: `mediapipe opencv-python smbus2 pyserial`. +- [ ] Pull moondream: `ollama pull moondream`. +- [ ] Run `vision_service.py`; confirm `GET /health` responds. +- [ ] Confirm `GET /face` returns a centroid when a face is in frame. +- [ ] Confirm `POST /track {"enabled": true}` moves servos toward face. +- [ ] Confirm `POST /rest` returns the head to the rest pose (pitch −5°, roll 0°). +- [ ] Run a visual query through the bridge: ask "what do you see?" and + confirm the response references the scene. +- [ ] Install `3bo-vision.service` unit and enable it alongside the other + services.
+ +--- + +## Source references + +- NVIDIA Jetson Orin Nano Super Developer Kit: https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/jetson-orin/nano-super-developer-kit/ +- whisper.cpp: https://github.com/ggml-org/whisper.cpp +- Piper TTS: https://github.com/rhasspy/piper +- Ollama: https://ollama.com +- MediaPipe: https://developers.google.com/mediapipe +- Piper voices (Hugging Face): https://huggingface.co/rhasspy/piper-voices diff --git a/examples/3bo/README.md b/examples/3bo/README.md new file mode 100644 index 0000000..ce40304 --- /dev/null +++ b/examples/3bo/README.md @@ -0,0 +1,177 @@ +# 3bo + +Small stationary robot. An Arduino Nano ESP32 handles the physical interface: +I2S microphone capture, wake-word detection, LED states, and speaker playback. +A Jetson Orin Nano runs whisper.cpp for STT, Piper for TTS, Arbiter for +reasoning, and the voice bridge that connects them. The Nano connects to the +Jetson over a single USB-C cable that carries both power and serial data. + +Cloud provider keys and Arbiter tenant tokens never leave the Jetson. The +firmware holds only a per-device bridge secret used as a Bearer token. + +The v1 build has no servos or camera. Build servo pockets into the v1 enclosure +for the M5 upgrade path. See [VISION.md](VISION.md) for the full design. 
+ +## Documents + +| File | Contents | +| --- | --- | +| `BOM.md` | Component list with part numbers and cost estimates | +| `CIRCUIT.md` | Bench wiring tables and bring-up order | +| `FIRMWARE.md` | Arduino firmware architecture, state machine, and bridge protocol | +| `JETSON.md` | Step-by-step Jetson setup: Arbiter, Ollama, whisper.cpp, Piper, bridge | +| `VISION.md` | M5 design spec: head camera, pitch/roll neck, motorised base, face tracking, VLM queries | +| `bridge/` | 3bo bridge launcher and hardware bring-up stub | +| `firmware/arduino/` | Arduino bench firmware sketch | + +## System architecture + +```mermaid +flowchart LR + mic["I2S microphone"] --> firmware["3bo firmware"] + firmware --> wake["Local wake word"] + wake --> recorder["Utterance recorder"] + recorder --> jetson["Jetson Orin brain"] + jetson --> bridge["3bo voice bridge"] + bridge --> stt["whisper.cpp STT"] + stt --> arbiter["Arbiter API"] + arbiter --> bridge + bridge --> tts["Piper TTS"] + tts --> firmware + firmware --> speaker["I2S amp + speaker"] + firmware --> leds["Status LEDs"] +``` + +**M5 addition** — OV5640 head camera read by the ESP32-S3 and forwarded to a vision service on the Jetson; pitch/roll neck servos +and motorised base driven by PCA9685 over I2C.
+ +```mermaid +flowchart LR + cam["OV5640 camera (head)"] --> firmware["3bo firmware"] + firmware --> vision["Vision service"] + vision --> pca["PCA9685 servo driver"] + pca --> servo_l["Servo L (neck rod)"] + pca --> servo_r["Servo R (neck rod)"] + vision --> bridge + mic["I2S microphone"] --> firmware["3bo firmware"] + firmware --> wake["Local wake word"] + wake --> recorder["Utterance recorder"] + recorder --> jetson["Jetson Orin brain"] + jetson --> bridge["3bo voice bridge"] + bridge --> stt["whisper.cpp STT"] + stt --> arbiter["Arbiter API"] + arbiter --> bridge + bridge --> tts["Piper TTS"] + tts --> firmware + firmware --> speaker["I2S amp + speaker"] + firmware --> leds["Status LEDs"] +``` + +## Hardware + +| Part | Example | Purpose | +| --- | --- | --- | +| Local brain | NVIDIA Jetson Orin Nano Super Developer Kit | Bridge, Arbiter, STT, TTS, storage | +| Controller | Arduino Nano ESP32 | Wake word, I2S mic/amp, LEDs, mute | +| Microphone | Adafruit ICS-43434 I2S MEMS mic breakout, product 6049 | Digital mono speech capture | +| Amplifier | Adafruit MAX98357A I2S amplifier, product 3006 | Speaker playback | +| Speaker | Adafruit 8 ohm 0.2 W mini speaker, product 1898 | Bring-up audio output | +| LEDs | Adafruit NeoPixel Stick 8 × RGBW, product 2869 | Robot state display | +| Mute | DPDT switch + P-channel MOSFET or load switch | Hard microphone power cutoff | + +Power rules: +- Jetson: vendor-recommended supply or a dedicated regulator on the 19 V input rail. +- Nano ESP32: powered from the Jetson USB host port only. Do not connect `VIN`. +- LEDs and amp: 5 V USB-powered load. Stay within the verified current budget. +- Microphone: 3.3 V only. Route through the mute switch so mute cuts mic power. + +See [BOM.md](BOM.md) for the full parts list and [CIRCUIT.md](CIRCUIT.md) for wiring.
+ +## Robot states + +| State | LED | Audio | Head (M5 only) | +| --- | --- | --- | --- | +| `idle` | Soft white breath | Wake-word listening | Rest pose (pitch −5°, roll 0°) | +| `wake_detected` | Quick white flash | Optional earcon | — | +| `scanning` | Amber sweep | Silent | Base yaw sweep ±45° searching for face | +| `listening` | Blue pulse | Recording | Tracking on locked face | +| `uploading` | Blue chase | Optional tick | Tracking | +| `thinking` | Amber sweep | Silent | Tracking | +| `speaking` | White/green meter | TTS playback | Tracking | +| `error` | Red blink | Short error phrase | Rest pose | +| `muted` | Dim red | Wake-word disabled | Rest pose | + +`scanning` and tracking are M5 additions. The v1 state machine has no +`scanning` state and no servo output. + +## LED patterns + +| Pattern | Trigger | +| --- | --- | +| `breath_idle` | Online, wake-word active | +| `wake_flash` | Wake word detected | +| `listen_pulse` | Recording utterance | +| `upload_chase` | Uploading to bridge | +| `think_sweep` | Waiting for response | +| `speak_meter` | Speaking, brightness tracks audio level | +| `error_blink` | Any failure | +| `muted_dim` | Microphone muted | + +## Privacy + +- Mute switch cuts microphone 3.3 V through a hardware pole or load switch. +- Firmware mute is a state signal only — privacy depends on the hardware path. +- Arbiter tokens and provider keys stay on the Jetson; firmware stores only the per-device bridge secret. +- Audio is not sent to the bridge before wake-word detection triggers. +- The bridge enforces auth, body-size limits, and per-IP rate limits before invoking STT, TTS, or Arbiter. + +## Prototype milestones + +### Milestone 1: Software loop + +See [JETSON.md](JETSON.md) for the full step-by-step setup. + +- Bring up Jetson with JetPack 6, cooling, and SSH. +- Build Arbiter; provision a tenant token; confirm `/v1/health`. +- Install Ollama; pull `gemma3:4b`; register the `local` agent. +- Build whisper.cpp with CUDA; download `ggml-base.en.bin`.
+- Install Piper; download `en_US-amy-low.onnx`. +- Run bridge stub; confirm auth rejection and canned WAV playback with the Nano. +- Switch to `bridge.py`; confirm `/v1/transcribe` and `/v1/utterance` return correct output. + +### Milestone 2: Audio and LED loop + +- Connect I2S microphone and speaker. +- Connect NeoPixel stick. +- Record a fixed-length utterance and upload to the bridge. +- Play back the TTS WAV response. +- Verify LED states for each robot state. + +### Milestone 3: Wake word + +- Add ESP-SR WakeNet or equivalent on-device wake-word engine. +- Replace fixed recording with wake-triggered recording. +- Add VAD silence detection and max-duration cutoff. +- Wire the hard microphone mute switch. + +### Milestone 4: Product hardening + +- USB CDC serial framing between Nano and Jetson (replacing Wi-Fi fallback). +- OTA firmware update path. +- Bridge-side diagnostic logs. +- Recovery behavior for Wi-Fi, STT, TTS, and Arbiter failures. + +### Milestone 5: Vision and head tracking + +See [VISION.md](VISION.md) for the full design. + +- OV5640 camera on head tier, read by the ESP32-S3 and forwarded over USB serial; vision service on Jetson. +- PCA9685 over Jetson I2C; two MG90S servos driving the differential pitch/roll neck rods; N20 base motor with DRV8833 and slip ring. +- Face detection at 20–30 fps; independent PID loops (base yaw for horizontal, head pitch for vertical). +- `scanning` state on wake: ±45° base yaw sweep until face locked or the 2.5 s cap expires. +- Visual query path: `needs_vision()` → capture frame → moondream2 → prepend to Arbiter message. + +## Open decisions + +- Wake word engine: ESP-SR WakeNet, Picovoice Porcupine, or custom TinyML. +- Scanning idle (M5): static rest pose vs. slow ambient scan — measure under load before deciding. diff --git a/examples/3bo/VISION.md b/examples/3bo/VISION.md new file mode 100644 index 0000000..5e1f93b --- /dev/null +++ b/examples/3bo/VISION.md @@ -0,0 +1,488 @@ +# 3bo Vision and Head-Tracking System + +> **MILESTONE 5 — FUTURE DESIGN ONLY** +> Nothing in this document is implemented in v1.
The v1 prototype has no camera, +> no servos, and no PCA9685. This document is a design specification for a +> future milestone. Do not treat any section as actionable guidance until +> Milestone 5 begins. + +--- + +## Overview + +This document describes the planned camera and head-tracking system for 3bo. + +3bo detects a human face on wake and orients toward the user throughout the conversation. The neck uses a three-arm differential mechanism: a passive ball-joint pivot at the rear of the head and two servo-driven push/pull rods at the front. Both servos together control pitch (nod); differential servo motion controls roll (head cock). All horizontal tracking is handled by the motorised base. The head camera enables visual queries — the user can ask what 3bo sees and the response draws on a live frame passed through a local VLM. + +The system adds an OV5640 camera on the head tier (read by the ESP32-S3 and forwarded to the Jetson over USB serial), a PCA9685 I2C PWM driver on the Jetson, two MG90S micro-servos driving push/pull rods from the neck base to the head, an N20 gearmotor with encoder for base rotation, and a slip ring at the base joint for continuous 360° rotation. +A background vision service runs on the Jetson and exposes a small localhost +HTTP API consumed by the bridge. + +--- + +## Hardware (Planned) + +### Component Table + +| Component | Part | Notes | +|---|---|---| +| Camera | Adafruit OV5640 Camera Breakout — 72° Lens with Autofocus, product 5945 | Mounted in robot head on custom carrier board. 8-bit parallel DVP to ESP32-S3 camera peripheral. ESP32-S3 JPEG-compresses frames and forwards them to Jetson over USB serial. | +| PWM driver | PCA9685 16-channel I2C servo driver | I2C address 0x40; connected to Jetson 40-pin header I2C bus | +| Servo L | MG90S metal-gear micro servo | PCA9685 channel 0. Left push/pull rod. | +| Servo R | MG90S metal-gear micro servo | PCA9685 channel 1. Right push/pull rod.
| +| Base motor | N20 gearmotor with quadrature encoder, 6 V, 100–200 RPM | DRV8833 H-bridge driver; IN1/IN2 from PCA9685 channels 2/3. Encoder A/B to Jetson GPIO. | +| Base bearing | Lazy Susan ball bearing, 100–150 mm | Supports full body weight through 360° rotation | +| Slip ring | 12-wire capsule slip ring, ≥ 2 A/circuit | Passes 19 V supply, 5 V body rail, motor control, and encoder signals through the rotating base joint | + +### Camera Specifications + +| Property | Value | +|---|---| +| Sensor | OV5640, 5 MP | +| Interface | 8-bit parallel DVP to ESP32-S3 camera peripheral; I2C (SCCB) for autofocus control | +| Resolution | VGA (640×480) or higher via `esp32-camera`; JPEG output | +| Horizontal FOV | 72° (non-distorting lens) | +| XCLK | Internal 24 MHz oscillator on breakout (enable via jumper) | +| Capture pipeline | `esp32-camera` on ESP32-S3; JPEG frames forwarded to Jetson over USB serial | + +### PCA9685 Wiring (Planned) + +| Signal | Jetson 40-pin header pins | +|---|---| +| I2C SDA | Pin 3 | +| I2C SCL | Pin 5 | +| VCC (logic) | Pin 1 (3.3 V) | +| GND | Any GND pin | +| V+ (servo power) | Pins 2 or 4 (5 V, shared rail — see Power section) | + +### Servo PWM Parameters + +| Parameter | Value | +|---|---| +| PWM frequency | 50 Hz | +| Minimum pulse width | 500 µs | +| Maximum pulse width | 2400 µs | +| Channel 0 | Servo L (left push/pull rod) | +| Channel 1 | Servo R (right push/pull rod) | + +### Neck Mechanism + +Three-arm differential design. The head is connected to the neck at three +points: one passive rear pivot and two servo-driven push/pull rods at the +front. + +**Connection points** + +| Point | Type | Position | Notes | +|---|---|---|---| +| Back pivot | Ball joint (M3 rod-end or printed socket) | Rear centre of head, at head CG height | Passive — provides the reaction point. Ball joint allows small compliance to prevent binding during combined pitch+roll. 
| +| Left rod | M3 threaded rod with ball-link ends | Front-left of head, 35 mm left of centreline | Driven by Servo L. | +| Right rod | M3 threaded rod with ball-link ends | Front-right of head, 35 mm right of centreline | Driven by Servo R. | + +**Geometry constraints** + +| Dimension | Value | Notes | +|---|---|---| +| Rod attachment width | 35 mm (centre-to-centre) | Narrower = more pitch authority relative to roll. Calibrate after first print. | +| Back pivot height | At head CG | Head CG must be measured with all components installed. | +| Rod angle at neutral | ~perpendicular to head front face | Maximises mechanical advantage at the midpoint of travel. | +| Servo horn radius | 15 mm (starting point) | Adjust to tune travel range vs. torque. | +| Hard stops | ±32° pitch, ±17° roll | 2° mechanical margin beyond software limits. | + +**Servo command mixing** + +Pitch (nod) and roll (head-cock) are computed from the two servo positions: + +``` +pitch = (servo_L + servo_R) / 2 +roll = (servo_L - servo_R) / 2 +``` + +To command a desired pitch and roll: + +``` +servo_L = pitch_cmd + roll_cmd +servo_R = pitch_cmd - roll_cmd +``` + +Both servo outputs are clamped to hardware travel limits before being written +to the PCA9685. During normal tracking, `roll_cmd = 0` and both servos move +identically. + +**Servo mounting** + +Both servos mount at the base of the neck (body side), not inside the head. +This keeps the head's moment of inertia low for faster PID response. The +push/pull rods run up through or alongside the neck tube to the head +attachment points. 
+ +--- + +## Range of Motion (Planned) + +### Angle Limits + +| Axis | Actuator | Command | Range | Hard stops | +|---|---|---|---|---| +| Pitch (head nod) | Servo L + Servo R together | `pitch_cmd` | ±30° | Yes, mechanical at ±32° | +| Roll (head cock) | Servo L vs Servo R differential | `roll_cmd` | ±15° | Yes, mechanical at ±17° | +| Yaw (base) | N20 + DRV8833 | base yaw command | 360° continuous | None — encoder-tracked in software | + +The head has no pan axis. All horizontal tracking is handled by base yaw. +Positive pitch = head tips up. Positive roll = head cocks right. Positive yaw = +clockwise viewed from above. Pitch and roll are clamped in software before +servo mixing; hard stops are a backup. + +### Named Positions + +| Position | Yaw | Pitch | Roll | Description | +|---|---|---|---|---| +| `home` | current | 0° | 0° | Head level, centred. Base holds position. | +| `rest` | current | -5° | 0° | Slight downward pitch toward seated user. Default between conversations. | +| `scan_start` | -45° | -5° | 0° | Base yaw at left edge of scan sweep. Head at rest pitch. | +| `scan_end` | +45° | -5° | 0° | Base yaw at right edge of scan sweep. Head at rest pitch. | + +Base yaw is not reset on idle — it holds the last oriented position. + +### Scan Pattern + +On wake the base sweeps from -45° to +45° yaw at ~20°/s while the head holds +rest pitch (-5° pitch, 0° roll). The sweep aborts as soon as a face is detected +or after 2.5 s. If no face is found within the cap, listening begins at whatever +yaw position the sweep reached. + +The head servos do not move during the scan sweep — only the base rotates. + +--- + +## Vision Service Design (Planned) + +> This section describes the intended design of `vision_service.py`, a +> background process that will run on the Jetson. No code is written yet. + +### Responsibilities + +The vision service will: + +1. Read incoming JPEG frames from the USB serial port (forwarded by the ESP32-S3 from the OV5640). +2. 
Run MediaPipe FaceDetector (full-range model, `model_selection=1`) on each + frame at approximately 20–30 fps. +3. Maintain the current tracking state (face centroid, confidence, pitch, roll, + yaw, servo L/R angles). +4. Run a PID position-control loop to convert face centroid error into pitch + and yaw commands; apply servo mixing for servo L and servo R. +5. Write servo L/R pulse widths to the PCA9685 over I2C; write base yaw + commands to the DRV8833 via PCA9685 channels 2–3. +6. Serve a small localhost HTTP API so the bridge can query state and issue + control commands. + +### PID Control Loop Design + +The face centroid is expressed in normalized image coordinates where (0.5, 0.5) +is the center of the frame. + +| Variable | Definition | +|---|---| +| `error_x` | `centroid_x − 0.5` (positive = face is right of centre) | +| `error_y` | `centroid_y − 0.5` (positive = face is below centre) | +| `yaw_correction` | `PID(error_x) × fov_h` — sent to base yaw motor | +| `pitch_correction` | `PID(error_y) × fov_v` — applied via servo mixing | + +`fov_h` and `fov_v` are the camera's horizontal and vertical field of view in +degrees; measure per chosen webcam model. + +Servo mixing applies pitch correction with roll held at zero during tracking: + +``` +servo_L_cmd = pitch_correction + 0 (roll = 0 during tracking) +servo_R_cmd = pitch_correction - 0 +``` + +All commanded values are clamped to hardware limits before output. When face +detection confidence falls below threshold or no face is present, the PID +integrators are frozen and all actuators hold their last commanded position. + +### Tracking Strategy + +The head has no pan axis. 
Horizontal and vertical tracking use separate +actuators with no interaction between loops: + +| Axis | Actuator | PID input | Speed | +|---|---|---|---| +| Horizontal | Base yaw (N20 motor) | `error_x` | ~20–40°/s | +| Vertical | Head pitch (differential servo) | `error_y` | ~60°/s max slew | +| Roll | Differential servo | Not used during tracking (roll = 0) | — | + +Because the base handles all horizontal correction and the head handles all +vertical correction, there is no two-stage interaction or cross-axis dependency +to manage. Each PID loop is independent. + +### Localhost HTTP API + +The vision service will expose the following endpoints on localhost (port TBD): + +#### GET /face + +Returns current face tracking state. + +| Field | Type | Description | +|---|---|---| +| `x` | float | Normalized face centroid X (0.0–1.0) | +| `y` | float | Normalized face centroid Y (0.0–1.0) | +| `confidence` | float | Face detection confidence (0.0–1.0) | +| `pitch_deg` | float | Current head pitch command in degrees | +| `roll_deg` | float | Current head roll command in degrees | +| `yaw_deg` | float | Current base yaw position in degrees (encoder-derived) | +| `servo_l_deg` | float | Current Servo L pulse position in degrees | +| `servo_r_deg` | float | Current Servo R pulse position in degrees | + +Example: `{"x":0.52,"y":0.41,"confidence":0.94,"pitch_deg":-4.1,"roll_deg":0.0,"yaw_deg":12.3,"servo_l_deg":-4.1,"servo_r_deg":-4.1}` + +#### GET /frame + +Returns the latest JPEG frame received from the ESP32-S3. Used by the bridge when a +visual query is needed. + +Response: `image/jpeg` binary body. + +#### POST /track + +Enables or disables servo output from the PID loop. + +Request body: + +| Field | Type | Description | +|---|---|---| +| `enabled` | bool | `true` to start tracking, `false` to hold position | + +When disabled, the servos hold the last commanded position. The `home` and +`rest` commands below work regardless of tracking state. 
+ +#### POST /home + +Drives head to pitch=0°, roll=0° (servo_L=0°, servo_R=0°) immediately. +Ignores tracking state. + +No request body required. + +#### POST /rest + +Drives head to pitch=-5°, roll=0° (servo_L=-5°, servo_R=-5°) immediately. +Ignores tracking state. + +No request body required. + +#### GET /health + +Liveness check. Returns 200 OK if the capture pipeline is running and the +PCA9685 is reachable. + +--- + +## Robot State Additions (Planned) + +Two new state concepts are planned for Milestone 5. They extend the existing +state table in `README.md` without replacing it. + +### New State: `scanning` + +| Property | Value | +|---|---| +| Trigger | Wake event received | +| Behavior | Base yaw sweeps from -45° to +45° at ~20°/s; head holds rest pitch (-5°, roll 0°) | +| Transition out | Face locked (→ `listening` + tracking active) or 2.5 s timeout (→ `listening`, no lock) | +| LED | Same as `wake_detected` → `listening` — no additional LED pattern needed | +| Duration cap | 2.5 s | + +The scanning state runs during the transition from wake detection to listening. +If a face is found within the cap, tracking activates and the conversation +proceeds normally. If no face is found within 2.5 s, listening begins anyway +with the head at whatever position the sweep reached. + +### New Mode Flag: `tracking` + +Tracking is not a standalone state — it is a concurrent mode flag that can be +active during `listening`, `thinking`, and `speaking` states. + +| Property | Value | +|---|---| +| Activated | When a face is locked during scanning | +| Deactivated | When the robot returns to idle | +| Effect | PID loop drives servos each frame to keep face centered | +| LED | No change — underlying conversation state LEDs remain in effect | + +Because tracking is a background mode flag rather than a foreground state, no +new LED pattern is needed for it. The head simply moves while the existing +conversation LED patterns play. 
+ +--- + +## Bridge Integration (Planned) + +> This section describes planned changes to +> `examples/3bo/bridge/bridge.py` and the generic bridge at +> `examples/voice-bridge/bridge.py`. No code is written yet. + +### Vision Service Base URL + +The bridge will read a `THREEBO_VISION_URL` environment variable (default: +`http://127.0.0.1:PORT`). All vision API calls go to that base. + +### Wake Event Handler (Planned) + +When the bridge receives a wake event: + +1. Call `POST /track` with `{"enabled": true}`. +2. Begin the base yaw scan sweep by driving the base to the `scan_start` position and + issuing incremental yaw commands toward `scan_end` at 20°/s; the head servos hold rest pitch. +3. Poll `GET /face` each sweep step. If `confidence` exceeds a threshold + (TBD, e.g. 0.85), stop sweep and let PID loop take over. +4. After lock or 2.5 s timeout, transition to `listening`. + +### Idle Return Handler (Planned) + +When the bridge returns to idle after a conversation: + +1. Call `POST /rest` to return the head to the rest pose. +2. Call `POST /track` with `{"enabled": false}`. + +### Visual Keyword Detection (Planned) + +Before forwarding a transcript to Arbiter, the bridge will run a +`needs_vision(transcript)` check. + +#### Trigger Keywords + +| Keyword or phrase | +|---| +| see | +| seeing | +| look | +| looking | +| show | +| in front of you | +| around you | +| what is that | +| describe | +| notice | + +#### Visual Query Pipeline + +When `needs_vision` returns true: + +1. Call `GET /frame` to retrieve the latest JPEG. +2. Base64-encode the frame. +3. Call the Ollama `/api/generate` endpoint with model `moondream` and the + encoded frame as the image input. +4. Use the prompt: `"Describe what you see concisely in two sentences."` +5. Prepend the model's response to the Arbiter message as: + `[Visual context: <description>]` +6.
Route the message to the cloud agent regardless of complexity classification + (visual context requires a full model response; the local fast-path agent + should not receive image-derived context). + +#### moondream2 Model Details + +| Property | Value | +|---|---| +| Model | moondream2 | +| Size | ~1.6 B parameters | +| Ollama name | `moondream` | +| Pull command | `ollama pull moondream` | +| API | Standard Ollama `/api/generate` with base64 image field | +| Prompt | "Describe what you see concisely in two sentences." | + +--- + +## Latency Notes (Planned) + +These are expected latency figures based on hardware specifications. Actual +values will need to be measured during Milestone 5 integration. + +| Operation | Expected latency | +|---|---| +| Face detection loop | ~33 ms per frame at ~30 fps | +| PID update | Each frame (~33 ms interval) | +| moondream2 frame query on Jetson Orin Nano | ~1.5–3 s | +| MG90S servo slew (rest to typical face angle ~30°) | ~50 ms | +| Scan to face lock (face within ±45°, 20°/s sweep) | 0–2.5 s (bounded by the scan cap; an uninterrupted −45° to +45° sweep at 20°/s would need 4.5 s, so the cap ends the sweep after ~50° of travel) | +| Scan timeout fallback | 2.5 s cap | + +The 2.5 s scan cap keeps wake-to-listening latency bounded at a level that +feels acceptable even when no face is present. The VLM query latency (1.5–3 s) +is additive to the normal STT and Arbiter latency for visual queries; that +budget should be communicated to users if possible (e.g. an extended thinking +LED phase).
+ +--- + +## Power Notes (Planned) + +### Servo Current Budget + +| Condition | Current per servo | Total (2 servos) | +|---|---|---| +| Idle / holding position | ~50 mA | ~100 mA | +| Active movement | ~150 mA (typical) | ~300 mA | +| Stall (hard limit) | ~250 mA | ~500 mA | + +### Supply Rail Plan + +| Rail | Source | Load | +|---|---|---| +| Servo V+ | Jetson 40-pin 5 V (pins 2 and 4) | PCA9685 V+ → Servo L, Servo R | +| Motor VM | Same 5 V rail → DRV8833 VM | N20 base motor (100–300 mA typical) | +| PCA9685 logic VCC | Jetson 40-pin 3.3 V (pin 1) | PCA9685 logic only | +| Encoder VCC | Base 3.3 V rail | N20 encoder logic | +| LED and audio 5 V | Existing USB/Nano body rail | NeoPixel, MAX98357A amp | + +Peak draw with both servos slewing and base motor running: ~700 mA on the 5 V +rail. The Jetson 40-pin 5 V header is rated ~3 A — sufficient with margin. +Add bulk capacitance (220–470 µF) near both the PCA9685 V+ terminal and the +DRV8833 VM pin to absorb simultaneous inrush from servos and motor start. + +The Jetson 40-pin 5 V rail (pins 2 and 4) can supply up to approximately 3 A, +which comfortably covers two MG90S servos at peak draw with margin remaining +for other 5 V peripherals. + +### Rail Isolation + +Keep servo V+ on the PCA9685 board separate from the logic body rail serving +LEDs and audio. Servo PWM noise and inrush current during slew should not +affect the audio amplifier or NeoPixel power path. Add bulk capacitance (100– +470 µF) near the PCA9685 V+ terminal to absorb slew inrush. + +Do not exceed the Jetson header 5 V current rating. If a future build adds more +servos, a dedicated 5 V regulator fed from the main battery rail is the +recommended path — not additional draws on the header pins. 
+ +--- + +## Milestone Placement + +| Milestone | Status | Scope | +|---|---|---| +| Milestone 1 | Planned | Software loop: Jetson, Arbiter, STT, TTS, bridge | +| Milestone 2 | Planned | Audio and LED loop: I2S mic, speaker, NeoPixel | +| Milestone 3 | Planned | Wake word: on-device detection, VAD, mute switch | +| Milestone 4 | Planned | Product hardening: pairing, persistence, OTA, recovery | +| **Milestone 5** | **This document** | **Vision and head-tracking: camera, servos, VLM queries** | + +### v1 Prototype Constraints + +The v1 prototype (Milestones 1–4) has no camera, no PCA9685, and no servos. +None of the hardware described in this document is installed in v1. + +The neck bracket design should include servo pocket cutouts and mounting holes +sized for MG90S servos so that Milestone 5 hardware can be installed without +a major mechanical rebuild. Pockets should be left empty in v1, with the servo +wire channels sealed against debris. + +The vision service (`vision_service.py`) does not run in v1. Its dependencies +(MediaPipe, OpenCV, `smbus2`, PCA9685 library) are not installed in v1. + +--- + +*Last updated: 2026-06-12. This is a design document. All content describes +planned future work for Milestone 5 and does not reflect the current state of +the 3bo prototype.* diff --git a/examples/3bo/bridge/README.md b/examples/3bo/bridge/README.md new file mode 100644 index 0000000..601788c --- /dev/null +++ b/examples/3bo/bridge/README.md @@ -0,0 +1,64 @@ +# 3bo bridge + +Two bridge scripts live here: + +| File | Purpose | +| --- | --- | +| `bridge_stub.py` | Minimal bring-up stub — proves the Nano HTTP contract without running STT, Arbiter, or TTS. Returns a canned WAV. | +| `bridge.py` | Thin launcher — maps `THREEBO_*` env vars to the generic voice bridge and execs it. | + +The full pipeline (STT → classify → Arbiter → TTS) lives in +[`examples/voice-bridge/bridge.py`](../../voice-bridge/bridge.py). 
+ +--- + +## Bring-up order + +### Step 1 — hardware contract (bridge_stub.py) + +Use the stub to verify the Nano↔Jetson HTTP contract before adding real +inference to the stack. + +```sh +THREEBO_DEVICE_SECRET='replace-with-random-secret' \ +python3 bridge_stub.py --host 0.0.0.0 --port 8081 +``` + +Point the firmware `THREEBO_BRIDGE_BASE_URL` at `http://<jetson-ip>:8081`. +Confirm the Nano uploads audio and plays back the canned response. + +### Step 2 — full pipeline (bridge.py) + +Once the hardware loop works, switch to the real bridge. + +```sh +THREEBO_DEVICE_SECRET='replace-with-random-secret' \ +THREEBO_ARBITER_TOKEN='atr_...' \ +THREEBO_WHISPER_MODEL='/opt/3bo/models/ggml-base.en.bin' \ +THREEBO_PIPER_MODEL='/opt/3bo/models/en_US-amy-low.onnx' \ +python3 bridge.py --host 0.0.0.0 --port 8081 +``` + +The real bridge uses `/v1/utterance` and `Authorization: Bearer <THREEBO_DEVICE_SECRET>`, not the stub's `/device/:id/utterance` and `X-3bo-Device-Secret`. The sketch already uses the correct endpoint and header. Rebuild and reflash the firmware before switching from the stub. + +Before rebuilding, make sure `THREEBO_BRIDGE_BASE_URL` in `threebo_config.h` points at the Jetson (e.g. `http://3bo.local:8081`, which requires avahi-daemon on the Jetson, or use the LAN IP directly); the value is compiled into the firmware, so changing it only takes effect after a reflash.
+ +--- + +## Environment variables + +| 3bo variable | Maps to | Required | +| --- | --- | --- | +| `THREEBO_DEVICE_SECRET` | `BRIDGE_API_KEY` | yes | +| `THREEBO_ARBITER_TOKEN` | `ARBITER_TOKEN` | yes | +| `THREEBO_WHISPER_MODEL` | `WHISPER_MODEL` | yes | +| `THREEBO_PIPER_MODEL` | `PIPER_MODEL` | yes | +| `THREEBO_ARBITER_URL` | `ARBITER_URL` | default `http://127.0.0.1:8080` | +| `THREEBO_WHISPER_BIN` | `WHISPER_BIN` | default `whisper-cli` | +| `THREEBO_PIPER_BIN` | `PIPER_BIN` | default `piper` | +| `THREEBO_PIPER_SAMPLE_RATE` | `PIPER_SAMPLE_RATE` | default `16000` | +| `THREEBO_LOCAL_AGENT` | `ARBITER_LOCAL_AGENT` | default `local` | +| `THREEBO_CLOUD_AGENT` | `ARBITER_CLOUD_AGENT` | default `index` | + +See [`examples/voice-bridge/bridge.py`](../../voice-bridge/bridge.py) for the +full list of generic options. diff --git a/examples/3bo/bridge/bridge.py b/examples/3bo/bridge/bridge.py new file mode 100644 index 0000000..544dd30 --- /dev/null +++ b/examples/3bo/bridge/bridge.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +"""3bo bridge launcher. + +Maps 3bo-specific env vars to the generic voice bridge and runs it. +The full pipeline (STT → classify → memory → Arbiter → TTS) lives in +examples/voice-bridge/bridge.py. + +Required: + THREEBO_DEVICE_SECRET used as BRIDGE_API_KEY (Authorization: Bearer) + THREEBO_ARBITER_TOKEN used as ARBITER_TOKEN + THREEBO_WHISPER_MODEL used as WHISPER_MODEL + THREEBO_PIPER_MODEL used as PIPER_MODEL + +Optional: + THREEBO_ARBITER_URL maps to ARBITER_URL (default http://127.0.0.1:8080) + THREEBO_WHISPER_BIN maps to WHISPER_BIN (default whisper-cli) + THREEBO_PIPER_BIN maps to PIPER_BIN (default piper) + THREEBO_PIPER_SAMPLE_RATE maps to PIPER_SAMPLE_RATE (default 16000) + THREEBO_LOCAL_AGENT maps to ARBITER_LOCAL_AGENT (default local) + THREEBO_CLOUD_AGENT maps to ARBITER_CLOUD_AGENT (default index) + THREEBO_CONVERSATION_FILE maps to BRIDGE_CONVERSATION_FILE + Path to persist conversation_id across restarts. 
+                             Recommended: /etc/3bo/conversation.json
+                             When set, cloud-tier turns have persistent memory.
+                             Say "forget everything" or "start fresh" to reset.
+
+The Nano ESP32 firmware sends POST /v1/utterance with:
+  Authorization: Bearer <THREEBO_DEVICE_SECRET>
+The firmware source uses THREEBO_BRIDGE_BASE_URL from threebo_config.h.
+The stub used /device/:id/utterance — make sure threebo_config.h points at
+the real bridge URL and that firmware has been rebuilt after that change.
+"""
+
+import os
+import sys
+
+# Generic voice-bridge variable name -> 3bo-specific variable it is copied from.
+_MAP = {
+    "BRIDGE_API_KEY": "THREEBO_DEVICE_SECRET",
+    "ARBITER_TOKEN": "THREEBO_ARBITER_TOKEN",
+    "WHISPER_MODEL": "THREEBO_WHISPER_MODEL",
+    "PIPER_MODEL": "THREEBO_PIPER_MODEL",
+    "ARBITER_URL": "THREEBO_ARBITER_URL",
+    "WHISPER_BIN": "THREEBO_WHISPER_BIN",
+    "PIPER_BIN": "THREEBO_PIPER_BIN",
+    "PIPER_SAMPLE_RATE": "THREEBO_PIPER_SAMPLE_RATE",
+    "ARBITER_LOCAL_AGENT": "THREEBO_LOCAL_AGENT",
+    "ARBITER_CLOUD_AGENT": "THREEBO_CLOUD_AGENT",
+    "BRIDGE_CONVERSATION_FILE": "THREEBO_CONVERSATION_FILE",
+}
+
+# Copy each THREEBO_* value to its generic name; a generic variable that is
+# already set in the environment wins and is left untouched.
+for generic, threebo in _MAP.items():
+    if threebo in os.environ and generic not in os.environ:
+        os.environ[generic] = os.environ[threebo]
+
+# The generic bridge is located relative to this file, not the working directory.
+_here = os.path.dirname(os.path.abspath(__file__))
+_generic = os.path.join(_here, "..", "..", "voice-bridge", "bridge.py")
+
+if not os.path.exists(_generic):
+    sys.exit(f"generic bridge not found at {_generic}")
+
+# Replace this process with the generic bridge, forwarding the CLI arguments.
+os.execv(sys.executable, [sys.executable, _generic] + sys.argv[1:])
diff --git a/examples/3bo/bridge/bridge_stub.py b/examples/3bo/bridge/bridge_stub.py
new file mode 100644
index 0000000..5730b29
--- /dev/null
+++ b/examples/3bo/bridge/bridge_stub.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+"""Minimal 3bo Jetson bridge contract stub.
+
+This accepts the Nano ESP32 bench firmware's WAV upload and returns a small
+16 kHz mono WAV. It does not run STT, Arbiter, or TTS yet; it exists to verify
+networking, auth, upload caps, response playback, and LED state transitions.
+""" + +from __future__ import annotations + +import argparse +import math +import os +import struct +import time +import wave +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from io import BytesIO + + +MAX_UPLOAD_BYTES = 512 * 1024 +RATE_WINDOW_SECONDS = 60 +RATE_MAX_REQUESTS = 12 + +REQUEST_TIMES: dict[str, list[float]] = {} + + +def make_test_wav() -> bytes: + sample_rate = 16_000 + duration_seconds = 0.45 + frequency_hz = 660 + frames = int(sample_rate * duration_seconds) + + pcm = bytearray() + for i in range(frames): + envelope = min(1.0, i / 800, (frames - i) / 1200) + sample = int(4500 * envelope * math.sin(2 * math.pi * frequency_hz * i / sample_rate)) + pcm.extend(struct.pack(" None: + parts = [p for p in self.path.split("/") if p] + if len(parts) != 3 or parts[0] != "device" or parts[2] != "utterance": + self.send_error(404, "not found") + return + + device_id = parts[1] + if not self._authorized(): + self.send_error(401, "missing or invalid device secret") + return + + if not self._within_rate_limit(device_id): + self.send_error(429, "device rate limit exceeded") + return + + content_length = self.headers.get("Content-Length") + try: + length = int(content_length or "0") + except ValueError: + self.send_error(400, "invalid content length") + return + + if length <= 0: + self.send_error(400, "empty utterance") + return + if length > MAX_UPLOAD_BYTES: + self.send_error(413, "utterance too large") + return + + body = self.rfile.read(length) + if len(body) != length: + self.send_error(400, "short upload") + return + + print(f"device={device_id} wav_bytes={len(body)} sample_rate={self.headers.get('X-Sample-Rate')}") + + self.send_response(200) + self.send_header("Content-Type", "audio/wav") + self.send_header("Content-Length", str(len(TEST_WAV))) + self.send_header("Connection", "close") + self.end_headers() + self.wfile.write(TEST_WAV) + + def do_GET(self) -> None: + if self.path == "/health": + body = b"ok\n" + self.send_response(200) 
+ self.send_header("Content-Type", "text/plain; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + return + self.send_error(404, "not found") + + def _authorized(self) -> bool: + expected = self.server.device_secret # type: ignore[attr-defined] + got = self.headers.get("X-3bo-Device-Secret", "") + return bool(expected) and got == expected + + def _within_rate_limit(self, device_id: str) -> bool: + now = time.monotonic() + recent = [t for t in REQUEST_TIMES.get(device_id, []) if now - t < RATE_WINDOW_SECONDS] + if len(recent) >= RATE_MAX_REQUESTS: + REQUEST_TIMES[device_id] = recent + return False + recent.append(now) + REQUEST_TIMES[device_id] = recent + return True + + def log_message(self, fmt: str, *args: object) -> None: + print("%s - %s" % (self.address_string(), fmt % args)) + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--host", default="127.0.0.1") + parser.add_argument("--port", type=int, default=8081) + args = parser.parse_args() + + secret = os.environ.get("THREEBO_DEVICE_SECRET", "") + if not secret: + raise SystemExit("set THREEBO_DEVICE_SECRET before starting the bridge stub") + + server = ThreadingHTTPServer((args.host, args.port), Handler) + server.device_secret = secret # type: ignore[attr-defined] + print(f"3bo bridge stub listening on http://{args.host}:{args.port}") + server.serve_forever() + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/3bo/firmware/README.md b/examples/3bo/firmware/README.md new file mode 100644 index 0000000..8192907 --- /dev/null +++ b/examples/3bo/firmware/README.md @@ -0,0 +1,65 @@ +# 3bo firmware + +This directory contains the first firmware scaffold for 3bo. + +## Arduino bench firmware + +`arduino/threebo_nano_esp32/threebo_nano_esp32.ino` is the bring-up firmware for +the current prototype hardware. It is intended to prove: + +- Wi-Fi connection to the local bridge. 
+- Mute switch behavior. +- RGBW NeoPixel state animations. +- I2S microphone recording. +- WAV upload to the bridge. +- WAV response playback through the MAX98357A amplifier. + +It does not implement the final keyword detector. It includes serial and +optional energy-based development triggers so the complete turn loop can be +tested before the ESP-SR wake provider is ported. + +## Expected bridge behavior + +The preferred product link is USB CDC serial over the same USB-C cable that +powers the Nano from the Jetson. The current Arduino bench sketch still uses +Wi-Fi/HTTP so the audio loop can be tested quickly before the serial framing +layer is implemented. + +The Wi-Fi/HTTP fallback sketch posts a WAV file: + +```http +POST /v1/utterance HTTP/1.1 +Authorization: Bearer <THREEBO_DEVICE_SECRET> +Content-Type: audio/wav +Content-Length: <wav-bytes> +``` + +The Jetson-hosted bridge should return a small 16 kHz mono signed 16-bit +`audio/wav` response with `Content-Length`. The firmware stores the response in +memory and plays it through I2S. + +The bridge rejects missing or invalid `Authorization: Bearer` headers before +invoking STT, TTS, or Arbiter. The secret is a local device-pairing credential +only; keep Arbiter tenant tokens and provider keys on the Jetson. + +## Arduino setup + +Install these libraries in the Arduino IDE: + +- Arduino ESP32 board support for Arduino Nano ESP32. +- Adafruit NeoPixel. + +Create a `threebo_config.h` next to the sketch using +`threebo_config.example.h` as the template. Keep Arbiter credentials and provider +keys out of this file; only the bridge should store those. Generate a random +`THREEBO_DEVICE_SECRET` and configure the same value in the Jetson bridge. + +The sketch controls the MAX98357A shutdown pin on `D8`. If the amplifier +shutdown pin is tied high in hardware instead, leave `D8` unconnected or remove +the shutdown writes from the sketch. + +## Production firmware + +The production firmware moves to ESP-IDF with ESP-SR WakeNet/AFE.
Keep +the same bridge contract and LED state names so the higher-level 3bo behavior +does not change when the wake provider is swapped in. diff --git a/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_config.example.h b/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_config.example.h new file mode 100644 index 0000000..6dc089c --- /dev/null +++ b/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_config.example.h @@ -0,0 +1,28 @@ +#pragma once + +// Local network settings for the Wi-Fi/HTTP fallback bench sketch. +// The preferred product link is USB CDC serial over the Jetson USB cable. +constexpr char THREEBO_WIFI_SSID[] = "your-wifi-ssid"; +constexpr char THREEBO_WIFI_PASSWORD[] = "your-wifi-password"; + +// Jetson bridge URL. The firmware posts to THREEBO_BRIDGE_BASE_URL/v1/utterance. +// Use the Jetson's mDNS name (requires avahi-daemon on the Jetson) or its LAN IP. +// Example with mDNS: "http://3bo.local:8081" +// Example with IP: "http://192.168.1.42:8081" +constexpr char THREEBO_BRIDGE_BASE_URL[] = "http://3bo.local:8081"; + +// Per-device shared secret. Must match THREEBO_DEVICE_SECRET on the Jetson. +// Sent as: Authorization: Bearer +// Generate a random value, e.g.: openssl rand -hex 32 +constexpr char THREEBO_DEVICE_SECRET[] = "replace-with-random-device-secret"; + +// Development wake triggers. These are not the final keyword detector. +constexpr bool THREEBO_ENABLE_SERIAL_WAKE = true; // Send 'w' over Serial. +constexpr bool THREEBO_ENABLE_ENERGY_WAKE = false; +constexpr int32_t THREEBO_ENERGY_WAKE_THRESHOLD = 1200; + +// Keep first tests gentle for the 0.2 W speaker and 8-pixel LED stick. 
+constexpr uint8_t THREEBO_LED_BRIGHTNESS = 28; +constexpr uint8_t THREEBO_RECORD_SECONDS = 4; +constexpr size_t THREEBO_MAX_RESPONSE_WAV_BYTES = 512 * 1024; +constexpr uint32_t THREEBO_HTTP_TIMEOUT_MS = 30000; diff --git a/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_nano_esp32.ino b/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_nano_esp32.ino new file mode 100644 index 0000000..c4c07fc --- /dev/null +++ b/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_nano_esp32.ino @@ -0,0 +1,551 @@ +#include +#include +#include +#include +#include +#include + +#include "threebo_config.h" + +constexpr int PIN_I2S_BCLK = D2; +constexpr int PIN_I2S_WS = D3; +constexpr int PIN_I2S_MIC = D4; +constexpr int PIN_I2S_AMP = D5; +constexpr int PIN_PIXELS = D6; +constexpr int PIN_MUTE = D7; +constexpr int PIN_AMP_SD = D8; + +constexpr uint32_t SAMPLE_RATE_HZ = 16000; +constexpr uint16_t PIXEL_COUNT = 8; +constexpr size_t WAV_HEADER_BYTES = 44; +constexpr size_t AUDIO_CHUNK_BYTES = 512; +constexpr uint32_t WIFI_RETRY_INTERVAL_MS = 5000; +constexpr uint32_t ERROR_HOLD_MS = 1500; + +Adafruit_NeoPixel pixels(PIXEL_COUNT, PIN_PIXELS, NEO_GRBW + NEO_KHZ800); +I2SClass Audio; + +enum class RobotState : uint8_t { + Boot, + WifiConnecting, + Idle, + WakeDetected, + Listening, + Uploading, + Thinking, + Speaking, + Muted, + Error +}; + +RobotState state = RobotState::Boot; +uint32_t state_started_ms = 0; +uint32_t last_wifi_attempt_ms = 0; +uint32_t last_energy_wake_ms = 0; +bool audio_rx_ready = false; + +size_t min_size(size_t a, size_t b) { + return a < b ? 
a : b;
+}
+
+// Returns the lowercase name printed on Serial for each state.
+const char *state_name(RobotState s) {
+  switch (s) {
+    case RobotState::Boot: return "boot";
+    case RobotState::WifiConnecting: return "wifi_connecting";
+    case RobotState::Idle: return "idle";
+    case RobotState::WakeDetected: return "wake_detected";
+    case RobotState::Listening: return "listening";
+    case RobotState::Uploading: return "uploading";
+    case RobotState::Thinking: return "thinking";
+    case RobotState::Speaking: return "speaking";
+    case RobotState::Muted: return "muted";
+    case RobotState::Error: return "error";
+  }
+  return "unknown";
+}
+
+// Switches state, restarts the animation clock, and logs the change.
+// A call that does not change the state is a no-op (no log, clock untouched).
+void set_state(RobotState next) {
+  if (state == next) return;
+  state = next;
+  state_started_ms = millis();
+  Serial.print("state=");
+  Serial.println(state_name(state));
+}
+
+// The mute input uses INPUT_PULLUP (see setup), so LOW means muted.
+bool is_muted() {
+  return digitalRead(PIN_MUTE) == LOW;
+}
+
+// Paints every pixel with one packed color and latches it.
+void set_all(uint32_t color) {
+  for (uint16_t i = 0; i < PIXEL_COUNT; ++i) {
+    pixels.setPixelColor(i, color);
+  }
+  pixels.show();
+}
+
+// Renders one animation frame for the current state. It is stateless apart
+// from millis(), so callers invoke it repeatedly from their own wait loops.
+void animate_leds() {
+  const uint32_t now = millis();
+  const uint32_t t = now - state_started_ms;  // time spent in the current state
+
+  switch (state) {
+    case RobotState::Boot:
+      set_all(pixels.Color(0, 0, 0, 24));
+      break;
+
+    case RobotState::WifiConnecting: {
+      // One lit pixel stepping around the strip every 120 ms.
+      pixels.clear();
+      const uint16_t active = (now / 120) % PIXEL_COUNT;
+      pixels.setPixelColor(active, pixels.Color(0, 0, 48, 0));
+      pixels.show();
+      break;
+    }
+
+    case RobotState::Idle: {
+      // Triangle wave (0..39..0) scaled down to a faint breathing level.
+      const uint8_t phase = (now / 28) % 80;
+      const uint8_t triangle = phase < 40 ? phase : 79 - phase;
+      const uint8_t white = 2 + triangle / 3;
+      set_all(pixels.Color(0, 0, 0, white));
+      break;
+    }
+
+    case RobotState::WakeDetected:
+      // Bright for the first 140 ms of the state, then dim.
+      set_all(pixels.Color(0, 0, 0, t < 140 ? 80 : 24));
+      break;
+
+    case RobotState::Listening: {
+      const uint8_t phase = (now / 22) % 90;
+      const uint8_t triangle = phase < 45 ? phase : 89 - phase;
+      set_all(pixels.Color(0, 0, 20 + triangle, 0));
+      break;
+    }
+
+    case RobotState::Uploading: {
+      // Bright pixel chasing over a dim background of the same color.
+      pixels.clear();
+      const uint16_t active = (now / 80) % PIXEL_COUNT;
+      for (uint16_t i = 0; i < PIXEL_COUNT; ++i) {
+        const uint8_t level = i == active ? 52 : 5;
+        pixels.setPixelColor(i, pixels.Color(0, 0, level, 0));
+      }
+      pixels.show();
+      break;
+    }
+
+    case RobotState::Thinking: {
+      pixels.clear();
+      const uint16_t active = (now / 100) % PIXEL_COUNT;
+      for (uint16_t i = 0; i < PIXEL_COUNT; ++i) {
+        if (i == active) {
+          pixels.setPixelColor(i, pixels.Color(45, 24, 0, 0));
+        } else {
+          pixels.setPixelColor(i, pixels.Color(4, 2, 0, 0));
+        }
+      }
+      pixels.show();
+      break;
+    }
+
+    case RobotState::Speaking: {
+      const uint8_t phase = (now / 18) % 60;
+      const uint8_t triangle = phase < 30 ? phase : 59 - phase;
+      set_all(pixels.Color(0, 18 + triangle, 8, 8 + triangle / 2));
+      break;
+    }
+
+    case RobotState::Muted:
+      set_all(pixels.Color(22, 0, 0, 0));
+      break;
+
+    case RobotState::Error:
+      // Blink: on for 180 ms, off for 180 ms.
+      set_all((now / 180) % 2 == 0 ?
pixels.Color(60, 0, 0, 0)
+                                   : pixels.Color(0, 0, 0, 0));
+      break;
+  }
+}
+
+// Allocates an audio buffer, preferring PSRAM and falling back to the normal
+// heap. Returns nullptr when both fail. Release with free().
+uint8_t *alloc_audio_buffer(size_t bytes) {
+  uint8_t *buffer =
+      static_cast<uint8_t *>(heap_caps_malloc(bytes, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT));
+  if (!buffer) {
+    buffer = static_cast<uint8_t *>(malloc(bytes));
+  }
+  return buffer;
+}
+
+// Stores v at p as 2 little-endian bytes.
+void put_u16_le(uint8_t *p, uint16_t v) {
+  p[0] = static_cast<uint8_t>(v & 0xff);
+  p[1] = static_cast<uint8_t>((v >> 8) & 0xff);
+}
+
+// Stores v at p as 4 little-endian bytes.
+void put_u32_le(uint8_t *p, uint32_t v) {
+  p[0] = static_cast<uint8_t>(v & 0xff);
+  p[1] = static_cast<uint8_t>((v >> 8) & 0xff);
+  p[2] = static_cast<uint8_t>((v >> 16) & 0xff);
+  p[3] = static_cast<uint8_t>((v >> 24) & 0xff);
+}
+
+// Writes the 44-byte canonical PCM WAV header (mono, 16-bit, SAMPLE_RATE_HZ)
+// at wav for a data chunk of pcm_bytes. wav must hold WAV_HEADER_BYTES.
+void write_wav_header(uint8_t *wav, uint32_t pcm_bytes) {
+  memcpy(wav + 0, "RIFF", 4);
+  put_u32_le(wav + 4, 36 + pcm_bytes);      // RIFF chunk size
+  memcpy(wav + 8, "WAVE", 4);
+  memcpy(wav + 12, "fmt ", 4);
+  put_u32_le(wav + 16, 16);                 // fmt chunk size
+  put_u16_le(wav + 20, 1);                  // format tag: PCM
+  put_u16_le(wav + 22, 1);                  // channels
+  put_u32_le(wav + 24, SAMPLE_RATE_HZ);
+  put_u32_le(wav + 28, SAMPLE_RATE_HZ * 2); // byte rate = rate * block align
+  put_u16_le(wav + 32, 2);                  // block align
+  put_u16_le(wav + 34, 16);                 // bits per sample
+  memcpy(wav + 36, "data", 4);
+  put_u32_le(wav + 40, pcm_bytes);
+}
+
+// Reconfigures I2S for microphone capture (amplifier shut down) and records
+// the outcome in audio_rx_ready. Returns false when either I2S call fails.
+bool begin_audio_rx() {
+  Audio.end();
+  digitalWrite(PIN_AMP_SD, LOW);
+  delay(10);
+  Audio.setPins(PIN_I2S_BCLK, PIN_I2S_WS, -1, PIN_I2S_MIC);
+
+  if (!Audio.begin(I2S_MODE_STD, SAMPLE_RATE_HZ, I2S_DATA_BIT_WIDTH_32BIT,
+                   I2S_SLOT_MODE_MONO, I2S_STD_SLOT_LEFT)) {
+    Serial.println("I2S RX init failed");
+    audio_rx_ready = false;
+    return false;
+  }
+
+  // The mic is read as 32-bit slots; the driver narrows them to 16-bit samples.
+  if (!Audio.configureRX(SAMPLE_RATE_HZ, I2S_DATA_BIT_WIDTH_32BIT,
+                         I2S_SLOT_MODE_MONO, I2S_RX_TRANSFORM_32_TO_16,
+                         I2S_STD_SLOT_LEFT)) {
+    Serial.println("I2S RX transform failed");
+    audio_rx_ready = false;
+    return false;
+  }
+
+  audio_rx_ready = true;
+  return true;
+}
+
+bool begin_audio_tx() {
+  Audio.end();
+  digitalWrite(PIN_AMP_SD, HIGH);
+  delay(10);
+  Audio.setPins(PIN_I2S_BCLK, PIN_I2S_WS, PIN_I2S_AMP, -1);
+
+  if (!Audio.begin(I2S_MODE_STD, SAMPLE_RATE_HZ, I2S_DATA_BIT_WIDTH_16BIT,
+                   I2S_SLOT_MODE_MONO, I2S_STD_SLOT_BOTH)) {
+    Serial.println("I2S TX init failed");
+    return false;
+  }
+
+  return true;
+}
+
+// Ensures Wi-Fi is connected. Returns true when already or newly connected.
+// Attempts are spaced WIFI_RETRY_INTERVAL_MS apart and each one blocks for up
+// to 12 s while animating the LEDs.
+bool connect_wifi() {
+  if (WiFi.status() == WL_CONNECTED) return true;
+
+  const uint32_t now = millis();
+  // last_wifi_attempt_ms == 0 means "never attempted": without this check the
+  // very first attempt was throttled until WIFI_RETRY_INTERVAL_MS after boot.
+  if (last_wifi_attempt_ms != 0 && now - last_wifi_attempt_ms < WIFI_RETRY_INTERVAL_MS) {
+    return false;
+  }
+  last_wifi_attempt_ms = now;
+
+  set_state(RobotState::WifiConnecting);
+  WiFi.mode(WIFI_STA);
+  WiFi.begin(THREEBO_WIFI_SSID, THREEBO_WIFI_PASSWORD);
+
+  const uint32_t started = millis();
+  while (WiFi.status() != WL_CONNECTED && millis() - started < 12000) {
+    animate_leds();
+    delay(25);
+  }
+
+  if (WiFi.status() == WL_CONNECTED) {
+    Serial.print("ip=");
+    Serial.println(WiFi.localIP());
+    return true;
+  }
+
+  Serial.println("Wi-Fi connection failed");
+  return false;
+}
+
+// Development trigger: drains pending Serial input and reports whether a
+// 'w' or 'W' was seen. Bytes read before the match are discarded.
+bool serial_wake_requested() {
+  if (!THREEBO_ENABLE_SERIAL_WAKE) return false;
+
+  while (Serial.available() > 0) {
+    const char c = static_cast<char>(Serial.read());
+    if (c == 'w' || c == 'W') {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Development trigger: reads 128 mic samples and fires when their mean
+// absolute value exceeds THREEBO_ENERGY_WAKE_THRESHOLD, at most once per 2.5 s.
+bool energy_wake_detected() {
+  if (!THREEBO_ENABLE_ENERGY_WAKE || !audio_rx_ready) return false;
+  if (millis() - last_energy_wake_ms < 2500) return false;
+
+  int16_t samples[128];
+  const size_t wanted = sizeof(samples);
+  const size_t got = Audio.readBytes(reinterpret_cast<char *>(samples), wanted);
+  if (got < wanted) return false;
+
+  int64_t sum = 0;
+  const size_t sample_count = got / sizeof(int16_t);
+  for (size_t i = 0; i < sample_count; ++i) {
+    const int32_t sample = samples[i];
+    sum += sample < 0 ?
-sample : sample;
+  }
+
+  const int32_t avg = static_cast<int32_t>(sum / sample_count);
+  if (avg > THREEBO_ENERGY_WAKE_THRESHOLD) {
+    last_energy_wake_ms = millis();
+    Serial.print("energy_wake avg=");
+    Serial.println(avg);
+    return true;
+  }
+
+  return false;
+}
+
+// True when any enabled development wake trigger fired.
+bool wake_detected() {
+  return serial_wake_requested() || energy_wake_detected();
+}
+
+// Records up to THREEBO_RECORD_SECONDS of mic audio into a newly allocated WAV
+// buffer. Stops early when the mute switch is engaged or 500 ms past the
+// nominal duration. On success returns the buffer (caller frees it) and sets
+// *out_len to header + PCM bytes; returns nullptr with *out_len == 0 when RX
+// cannot be started or allocation fails.
+uint8_t *record_utterance_wav(size_t *out_len) {
+  *out_len = 0;
+  if (!audio_rx_ready && !begin_audio_rx()) return nullptr;
+
+  const uint32_t seconds = THREEBO_RECORD_SECONDS > 0 ? THREEBO_RECORD_SECONDS : 1;
+  const size_t max_pcm_bytes = seconds * SAMPLE_RATE_HZ * sizeof(int16_t);
+  uint8_t *wav = alloc_audio_buffer(WAV_HEADER_BYTES + max_pcm_bytes);
+  if (!wav) {
+    Serial.println("audio allocation failed");
+    return nullptr;
+  }
+
+  // Placeholder header; rewritten below once the real PCM size is known.
+  write_wav_header(wav, 0);
+
+  size_t written = 0;
+  uint8_t *pcm = wav + WAV_HEADER_BYTES;
+  const uint32_t started = millis();
+
+  while (written < max_pcm_bytes && !is_muted()) {
+    const size_t remaining = max_pcm_bytes - written;
+    const size_t chunk = min_size(AUDIO_CHUNK_BYTES, remaining);
+    const size_t got = Audio.readBytes(reinterpret_cast<char *>(pcm + written), chunk);
+
+    if (got > 0) {
+      written += got;
+    } else {
+      delay(1);
+    }
+
+    animate_leds();
+
+    // Wall-clock guard so a stalled I2S read cannot hold the turn open.
+    if (millis() - started > (seconds * 1000UL + 500UL)) {
+      break;
+    }
+  }
+
+  write_wav_header(wav, written);
+  *out_len = WAV_HEADER_BYTES + written;
+  Serial.print("recorded_wav_bytes=");
+  Serial.println(*out_len);
+  return wav;
+}
+
+// Reads the whole HTTP response body into a newly allocated buffer. Requires a
+// positive Content-Length no larger than THREEBO_MAX_RESPONSE_WAV_BYTES. On
+// success *out owns the buffer (caller frees it) and *out_len is its size.
+bool read_response_body(HTTPClient &http, uint8_t **out, size_t *out_len) {
+  *out = nullptr;
+  *out_len = 0;
+
+  const int length = http.getSize();
+  if (length <= 0) {
+    Serial.println("bridge response needs Content-Length");
+    return false;
+  }
+  if (static_cast<size_t>(length) > THREEBO_MAX_RESPONSE_WAV_BYTES) {
+    Serial.println("bridge response too large");
+    return false;
+  }
+
+  uint8_t *body = alloc_audio_buffer(static_cast<size_t>(length));
+  if (!body) {
+    Serial.println("response allocation failed");
+    return
false;
+  }
+
+  WiFiClient *stream = http.getStreamPtr();
+  size_t read_total = 0;
+  const uint32_t started = millis();
+
+  while (read_total < static_cast<size_t>(length) &&
+         millis() - started < THREEBO_HTTP_TIMEOUT_MS) {
+    const int available = stream->available();
+    if (available > 0) {
+      const size_t chunk =
+          min_size(static_cast<size_t>(available), static_cast<size_t>(length) - read_total);
+      const size_t got = stream->readBytes(reinterpret_cast<char *>(body + read_total), chunk);
+      read_total += got;
+    } else {
+      animate_leds();
+      delay(5);
+    }
+  }
+
+  if (read_total != static_cast<size_t>(length)) {
+    free(body);
+    Serial.println("bridge response read timed out");
+    return false;
+  }
+
+  *out = body;
+  *out_len = read_total;
+  return true;
+}
+
+// Posts the recorded WAV to THREEBO_BRIDGE_BASE_URL/v1/utterance and plays the
+// WAV reply through the amplifier.
+//
+// Takes ownership of `wav`: it is freed on every path, so the caller must not
+// touch it afterwards. Returns true only when the reply was fetched and
+// playback was started. Leaves I2S stopped; the caller re-enables RX.
+bool upload_utterance_and_play_response(uint8_t *wav, size_t wav_len) {
+  set_state(RobotState::Uploading);
+  animate_leds();
+
+  WiFiClient client;
+  HTTPClient http;
+  const String url = String(THREEBO_BRIDGE_BASE_URL) + "/v1/utterance";
+
+  if (!http.begin(client, url)) {
+    Serial.println("HTTP begin failed");
+    free(wav);  // this early return used to leak the recording
+    return false;
+  }
+
+  http.setTimeout(THREEBO_HTTP_TIMEOUT_MS);
+  http.addHeader("Content-Type", "audio/wav");
+  http.addHeader("Accept", "audio/wav");
+  http.addHeader("X-Sample-Rate", String(SAMPLE_RATE_HZ));
+  http.addHeader("X-Device-State", "listening");
+  http.addHeader("Authorization", String("Bearer ") + THREEBO_DEVICE_SECRET);
+
+  const int status = http.POST(wav, wav_len);
+  // Release the upload before the response buffer is allocated.
+  free(wav);
+  wav = nullptr;
+
+  if (status != HTTP_CODE_OK) {
+    Serial.print("bridge status=");
+    Serial.println(status);
+    http.end();
+    return false;
+  }
+
+  set_state(RobotState::Thinking);
+  uint8_t *response = nullptr;
+  size_t response_len = 0;
+  const bool read_ok = read_response_body(http, &response, &response_len);
+  http.end();
+
+  if (!read_ok) return false;
+
+  set_state(RobotState::Speaking);
+  if (!begin_audio_tx()) {
+    free(response);
+    return false;
+  }
+
+  Audio.playWAV(response, response_len);
+  Audio.end();
+  free(response);
+  return true;
+}
+
+// Runs one complete turn: wake flash, record, upload, play the reply.
+// Blocks until the turn is finished. Except for the early mute return below
+// (which loop() resolves to Muted), it leaves the robot in Idle or Muted with
+// microphone capture re-enabled.
+void handle_turn() {
+  set_state(RobotState::WakeDetected);
+  const uint32_t flash_started = millis();
+  while (millis() - flash_started < 250) {
+    animate_leds();
+    delay(10);
+  }
+
+  if (is_muted()) return;
+
+  set_state(RobotState::Listening);
+  size_t wav_len = 0;
+  uint8_t *wav = record_utterance_wav(&wav_len);
+
+  // Stop RX so the shared I2S peripheral can be reused for playback.
+  Audio.end();
+  audio_rx_ready = false;
+
+  if (!wav || wav_len <= WAV_HEADER_BYTES || is_muted()) {
+    if (wav) free(wav);
+    Serial.println("utterance discarded");
+    // Without this the state stayed Listening with RX stopped; loop() only
+    // acts on Idle/Muted, so an unmuted discard (allocation failure or an
+    // empty recording) left the robot unresponsive.
+    begin_audio_rx();
+    set_state(is_muted() ? RobotState::Muted : RobotState::Idle);
+    return;
+  }
+
+  // Ownership of wav passes to the upload helper.
+  const bool ok = upload_utterance_and_play_response(wav, wav_len);
+  if (!ok) {
+    set_state(RobotState::Error);
+    const uint32_t error_started = millis();
+    while (millis() - error_started < ERROR_HOLD_MS) {
+      animate_leds();
+      delay(20);
+    }
+  }
+
+  begin_audio_rx();
+  set_state(is_muted() ? RobotState::Muted : RobotState::Idle);
+}
+
+void setup() {
+  Serial.begin(115200);
+  delay(300);
+
+  pinMode(PIN_MUTE, INPUT_PULLUP);
+  pinMode(PIN_AMP_SD, OUTPUT);
+  digitalWrite(PIN_AMP_SD, LOW);
+
+  pixels.begin();
+  pixels.setBrightness(THREEBO_LED_BRIGHTNESS);
+  pixels.clear();
+  pixels.show();
+
+  set_state(RobotState::Boot);
+  animate_leds();
+
+  connect_wifi();
+  if (!begin_audio_rx()) {
+    set_state(RobotState::Error);
+  } else {
+    set_state(is_muted() ? RobotState::Muted : RobotState::Idle);
+  }
+
+  Serial.println("3bo ready. 
Send 'w' over Serial for a development wake.");
+}
+
+// Main loop: animate, keep Wi-Fi up, track the mute switch, and start a turn
+// when a wake trigger fires while Idle.
+// NOTE(review): when state is Error on entry, none of the branches below
+// changes it unless the mute switch is toggled — confirm that is intended.
+void loop() {
+  animate_leds();
+
+  // Nothing else runs while Wi-Fi is down; connect_wifi() rate-limits itself.
+  if (!connect_wifi()) {
+    delay(25);
+    return;
+  }
+
+  if (state == RobotState::WifiConnecting) {
+    set_state(RobotState::Idle);
+  }
+
+  // Bring microphone capture back up if it is not ready while Idle.
+  if (!audio_rx_ready && state == RobotState::Idle && !begin_audio_rx()) {
+    set_state(RobotState::Error);
+    delay(25);
+    return;
+  }
+
+  if (is_muted()) {
+    set_state(RobotState::Muted);
+    delay(25);
+    return;
+  }
+
+  // Mute switch released: resume normal operation.
+  if (state == RobotState::Muted) {
+    set_state(RobotState::Idle);
+  }
+
+  if (state == RobotState::Idle && wake_detected()) {
+    handle_turn();
+  }
+
+  delay(5);
+}
diff --git a/examples/voice-bridge/README.md b/examples/voice-bridge/README.md
new file mode 100644
index 0000000..29882cb
--- /dev/null
+++ b/examples/voice-bridge/README.md
@@ -0,0 +1,130 @@
+# Arbiter voice bridge
+
+A generic bridge between any audio-capable device and a locally-running
+Arbiter API, using whisper.cpp for speech-to-text and Piper for
+text-to-speech.
+
+```
+device (WAV upload)
+  → whisper.cpp (STT)
+  → classifier (local vs cloud, ~0 ms, rule-based)
+  → Arbiter SSE (local Ollama agent or cloud agent)
+  → Piper (TTS, sentence-concurrent with generation)
+  → device (WAV response)
+```
+
+## Requirements
+
+| Tool | Install |
+| --- | --- |
+| whisper.cpp | Build from source: `cmake -B build && cmake --build build -j$(nproc)` |
+| Piper | `pip install piper-tts` or download a release binary |
+| Arbiter | `arbiter --api --bind 127.0.0.1 --port 8080` |
+| Ollama (optional) | For the local fast-path agent |
+
+## Quick start
+
+```sh
+# Pull a local model for simple queries (optional but recommended)
+ollama pull gemma3:4b
+
+# Start Arbiter
+arbiter --api &
+
+# Register the local agent
+curl -sX POST http://127.0.0.1:8080/v1/agents \
+  -H "Authorization: Bearer $ARBITER_TOKEN" \
+  -H "Content-Type: application/json" \
+  -d '{"id":"local","model":"ollama/gemma3:4b","max_tokens":256,
+       "goal":"Answer simple questions concisely in one or two sentences."}'
+ +# Start the bridge +ARBITER_TOKEN='atr_...' \ +WHISPER_MODEL='/path/to/ggml-base.en.bin' \ +PIPER_MODEL='/path/to/en_US-amy-low.onnx' \ +BRIDGE_API_KEY='your-secret' \ +python3 bridge.py --port 8081 +``` + +## Endpoints + +| Method | Path | Description | +| --- | --- | --- | +| `POST` | `/v1/utterance` | WAV in → WAV out (main device endpoint) | +| `POST` | `/v1/transcribe` | WAV in → `{"transcript":"..."}` (debug/latency test) | +| `GET` | `/health` | `200 ok` liveness check | + +### POST /v1/utterance + +Upload 16 kHz mono signed-16-bit WAV, receive a WAV response. + +```http +POST /v1/utterance HTTP/1.1 +Authorization: Bearer <BRIDGE_API_KEY> +Content-Type: audio/wav +Content-Length: <byte-count> +X-Complexity-Hint: local (optional — override the classifier) +``` + +Response: `audio/wav` with `Content-Length`. + +### POST /v1/transcribe + +Same upload format; returns JSON instead of audio. Useful for measuring +STT latency before wiring the full pipeline. + +```json +{"transcript": "what time is it"} +``` + +## Environment variables + +| Variable | Required | Default | Description | +| --- | --- | --- | --- | +| `ARBITER_TOKEN` | yes | — | Arbiter bearer token (`atr_...`) | +| `WHISPER_MODEL` | yes | — | Path to whisper.cpp ggml model file | +| `PIPER_MODEL` | yes | — | Path to Piper `.onnx` voice model | +| `ARBITER_URL` | no | `http://127.0.0.1:8080` | Arbiter API base URL | +| `ARBITER_LOCAL_AGENT` | no | `local` | Agent ID for simple queries | +| `ARBITER_CLOUD_AGENT` | no | `index` | Agent ID for complex queries | +| `WHISPER_BIN` | no | `whisper-cli` | whisper.cpp binary name or path | +| `PIPER_BIN` | no | `piper` | Piper binary name or path | +| `PIPER_SAMPLE_RATE` | no | `16000` | Must match the Piper model's output rate | +| `BRIDGE_API_KEY` | no | — | If set, require `Authorization: Bearer <BRIDGE_API_KEY>` | +| `BRIDGE_MAX_BYTES` | no | `524288` | Max upload size in bytes | +| `BRIDGE_RATE_LIMIT` | no | `20` | Max requests per source IP per 60 s | + +If `BRIDGE_API_KEY` is unset, the bridge
warns at startup and binds to +`127.0.0.1` only, regardless of `--host`. + +## Complexity routing + +The bridge classifies each transcript with a fast rule-based heuristic +(~0 ms, no model call) and routes to one of two Arbiter agents: + +| Tier | Examples | Agent | +| --- | --- | --- | +| `local` | arithmetic, short time/date queries, greetings | `ARBITER_LOCAL_AGENT` | +| `cloud` | multi-sentence reasoning, planning, open-ended | `ARBITER_CLOUD_AGENT` | + +Pass `X-Complexity-Hint: local` or `X-Complexity-Hint: cloud` to bypass +the classifier from the device side. + +## Latency pipeline + +The bridge pipelines Arbiter text generation with Piper synthesis: each +sentence is submitted to Piper as soon as it completes, while the model +is still generating subsequent sentences. For a four-sentence response, +sentences 1–3 synthesise while the model finishes sentence 4, so (when +Piper keeps up with generation) the end-to-end latency is roughly +`model_time + last_sentence_tts_time` rather than +`model_time + total_tts_time`. + +The response is returned only after all synthesis completes (firmware +needs `Content-Length` up front). + +## Hardware examples + +- **3bo robot** — see [`examples/3bo/bridge/`](../3bo/bridge/) for the + thin `THREEBO_*` env-var shim that maps 3bo config to this bridge. +- Any ESP32, Raspberry Pi, or other device that can POST a WAV over HTTP + and play a WAV response works without modification. diff --git a/examples/voice-bridge/bridge.py b/examples/voice-bridge/bridge.py new file mode 100644 index 0000000..c8f9998 --- /dev/null +++ b/examples/voice-bridge/bridge.py @@ -0,0 +1,743 @@ +#!/usr/bin/env python3 +"""Arbiter voice bridge — generic whisper.cpp + Piper hardware bridge. + +Accepts a WAV upload from any device, runs local STT, routes to Arbiter, +synthesises the response with Piper, and returns a WAV. Designed to sit +between any audio-capable microcontroller and a locally-running Arbiter API. + +Pipeline per request: + 1. Auth check, body-size cap, per-IP rate limit. + 2.
Transcribe uploaded WAV with whisper.cpp. + 3. Classify transcript: reset command, local tier, or cloud tier. + 4. Local tier → stateless /v1/orchestrate (fast, no memory). + 5. Cloud tier → /v1/conversations/:id/messages (persistent memory). + 6. Synthesise each sentence with Piper as text arrives (overlaps generation). + 7. Concatenate PCM → WAV header → return with Content-Length. + +Endpoints: + POST /v1/utterance WAV upload → WAV response (main device endpoint) + POST /v1/transcribe WAV upload → JSON {"transcript": "..."} (debug/test) + GET /health 200 ok\\n + +Required environment variables: + ARBITER_TOKEN Arbiter bearer token (atr_...) + WHISPER_MODEL path to whisper.cpp ggml model file + PIPER_MODEL path to Piper .onnx voice model file + +Optional environment variables: + ARBITER_URL default http://127.0.0.1:8080 + ARBITER_LOCAL_AGENT default local (Ollama-backed fast agent) + ARBITER_CLOUD_AGENT default index (cloud model, owns the conversation) + WHISPER_BIN default whisper-cli + PIPER_BIN default piper + PIPER_SAMPLE_RATE default 16000 (must match the Piper model's output rate) + BRIDGE_API_KEY if set, requests must carry Authorization: Bearer + if unset, the bridge warns and binds to loopback only + BRIDGE_MAX_BYTES max upload bytes, default 524288 (512 KB) + BRIDGE_RATE_LIMIT max requests per IP per 60 s, default 20 + BRIDGE_CONVERSATION_FILE path to persist conversation state across restarts + (e.g. /etc/3bo/conversation.json) + if unset, cloud turns are stateless (no memory) + +Memory model: + Simple/local queries (arithmetic, time, greetings) are routed stateless to + the local Ollama agent — they do not benefit from history and the round-trip + would add latency. + + Complex/cloud queries are sent through a persistent Arbiter conversation so + the agent remembers prior turns. The conversation_id is saved to + BRIDGE_CONVERSATION_FILE and survives bridge restarts. 
+ + Say "forget everything", "start fresh", "reset", or similar to start a new + conversation. The bridge creates a fresh conversation and acknowledges. + +Arbiter agent setup (one-time): + arbiter --api & + + # Fast local agent (Ollama must be running) + curl -sX POST http://127.0.0.1:8080/v1/agents \\ + -H "Authorization: Bearer $ARBITER_TOKEN" \\ + -H "Content-Type: application/json" \\ + -d '{"id":"local","model":"ollama/gemma3:4b","max_tokens":256, + "goal":"Answer simple questions concisely in one or two sentences."}' +""" + +from __future__ import annotations + +import argparse +import http.client +import json +import logging +import math +import os +import re +import struct +import subprocess +import tempfile +import threading +import time +import uuid +from concurrent.futures import Future, ThreadPoolExecutor +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from typing import Iterator +from urllib.parse import urlparse + + +# ────────────────────────────────────────────────────────────────────────────── +# Configuration +# ────────────────────────────────────────────────────────────────────────────── + +def _require(name: str) -> str: + v = os.environ.get(name, "").strip() + if not v: + raise SystemExit(f"required env var {name!r} is not set") + return v + + +def _opt(name: str, fallback: str) -> str: + return os.environ.get(name, fallback).strip() or fallback + + +ARBITER_TOKEN = _require("ARBITER_TOKEN") +WHISPER_MODEL = _require("WHISPER_MODEL") +PIPER_MODEL = _require("PIPER_MODEL") + +ARBITER_URL = _opt("ARBITER_URL", "http://127.0.0.1:8080") +ARBITER_LOCAL = _opt("ARBITER_LOCAL_AGENT", "local") +ARBITER_CLOUD = _opt("ARBITER_CLOUD_AGENT", "index") +WHISPER_BIN = _opt("WHISPER_BIN", "whisper-cli") +PIPER_BIN = _opt("PIPER_BIN", "piper") +PIPER_SAMPLE_RATE = int(_opt("PIPER_SAMPLE_RATE", "16000")) +BRIDGE_API_KEY = os.environ.get("BRIDGE_API_KEY", "").strip() +BRIDGE_MAX_BYTES = int(_opt("BRIDGE_MAX_BYTES", "524288")) 
def make_wav(pcm: bytes, sample_rate: int = PIPER_SAMPLE_RATE) -> bytes:
    """Wrap raw signed-16-bit mono PCM in a 44-byte RIFF/WAVE header.

    The header is assembled from its three chunks (RIFF descriptor, ``fmt ``
    chunk, ``data`` chunk header); all fields are little-endian and packed
    without padding.
    """
    n_channels = 1
    sample_bits = 16
    payload_len = len(pcm)

    riff_chunk = struct.pack("<4sI4s", b"RIFF", 36 + payload_len, b"WAVE")
    fmt_chunk = struct.pack(
        "<4sIHHIIHH",
        b"fmt ",
        16,                                        # fmt chunk body size
        1,                                         # audio format tag passed as-is (1)
        n_channels,
        sample_rate,
        sample_rate * n_channels * sample_bits // 8,  # byte rate
        n_channels * sample_bits // 8,                # block align
        sample_bits,
    )
    data_chunk_header = struct.pack("<4sI", b"data", payload_len)

    return riff_chunk + fmt_chunk + data_chunk_header + pcm
def transcribe(wav_path: str) -> str:
    """Run whisper.cpp on ``wav_path`` and return the transcript text.

    Returns ``''`` when the binary is missing, exits non-zero, times out
    (60 s), or produces no text. The transcript is read from the sidecar
    file ``wav_path + ".txt"`` when it exists; otherwise it is rebuilt from
    stdout, dropping blank lines and lines that start with ``[``. The
    sidecar file is always removed before returning.

    Both stdout and the sidecar file are decoded explicitly as UTF-8 with
    replacement so that a non-UTF-8 process locale cannot raise
    ``UnicodeDecodeError`` in the middle of a request.
    """
    txt_path = wav_path + ".txt"
    try:
        r = subprocess.run(
            [WHISPER_BIN, "-m", WHISPER_MODEL, "-f", wav_path,
             "-otxt", "-nt", "-l", "en"],
            capture_output=True,
            # Explicit decoding instead of text=True: do not depend on the
            # locale the service happens to be started under.
            encoding="utf-8",
            errors="replace",
            timeout=60,
        )
        if r.returncode != 0:
            log.warning("whisper exit=%d stderr=%s", r.returncode, r.stderr[:200])
            return ""

        if os.path.exists(txt_path):
            with open(txt_path, encoding="utf-8", errors="replace") as f:
                return f.read().strip()

        # No sidecar file: fall back to stdout, skipping bracketed lines.
        lines = [
            ln.strip() for ln in r.stdout.splitlines()
            if ln.strip() and not ln.strip().startswith("[")
        ]
        return " ".join(lines)

    except FileNotFoundError:
        log.error("whisper binary not found at %r", WHISPER_BIN)
        return ""
    except subprocess.TimeoutExpired:
        log.error("whisper timed out")
        return ""
    finally:
        # Best-effort cleanup; the sidecar may never have been created.
        try:
            os.unlink(txt_path)
        except OSError:
            pass
def classify(transcript: str) -> str:
    """Pick the routing tier for a transcript: ``'local'`` or ``'cloud'``.

    A transcript is ``'local'`` when any of these holds:
      * its normalised form (stripped, lower-cased, trailing ``.!?`` and
        spaces removed) is one of ``_SIMPLE_PHRASES``;
      * a pattern rule matches and the word count is within that rule's cap;
      * it is a question of at most seven words with no ``.`` before its
        final character.
    Everything else is ``'cloud'``.
    """
    normalised = transcript.strip().lower().rstrip(".!? ")
    n_words = len(normalised.split())

    if normalised in _SIMPLE_PHRASES:
        return "local"

    # (pattern, maximum word count) — patterns run against the raw transcript.
    capped_rules = (
        (_MATH_EXPR, 12),
        (_MATH_QUERY, 12),
        (_TIME_WORDS, 8),
        (_UNIT_CONV, 15),
    )
    if any(n_words <= cap and rx.search(transcript) for rx, cap in capped_rules):
        return "local"

    short_question = (
        n_words <= 7
        and transcript.strip().endswith("?")
        and "." not in transcript[:-1]
    )
    return "local" if short_question else "cloud"
+ """ + + def __init__(self, path: str, cloud_agent: str) -> None: + self._path = path + self._agent = cloud_agent + self._lock = threading.Lock() + self._id: int | None = self._load() + if self._id is not None: + log.info("conversation loaded id=%d from %s", self._id, path) + else: + log.info("no stored conversation — will create on first cloud turn") + + # ── public ──────────────────────────────────────────────────────────────── + + def get_or_create(self) -> int: + """Return the current conversation_id, creating one if needed.""" + with self._lock: + if self._id is None: + self._id = self._create() + self._save() + return self._id + + def reset(self) -> int: + """Discard the current conversation and start a new one.""" + with self._lock: + old = self._id + self._id = self._create() + self._save() + log.info("conversation reset old=%s new=%d", old, self._id) + return self._id + + def mark_expired(self, stale_id: int) -> None: + """Called after a 404 — clears the ID so next call creates fresh.""" + with self._lock: + if self._id == stale_id: + self._id = None + + # ── private ─────────────────────────────────────────────────────────────── + + def _load(self) -> int | None: + try: + with open(self._path) as f: + return int(json.load(f)["conversation_id"]) + except (FileNotFoundError, KeyError, ValueError, json.JSONDecodeError): + return None + + def _save(self) -> None: + try: + os.makedirs(os.path.dirname(os.path.abspath(self._path)), exist_ok=True) + with open(self._path, "w") as f: + json.dump({"conversation_id": self._id}, f) + except OSError as exc: + log.warning("failed to save conversation state: %s", exc) + + def _create(self) -> int: + body = json.dumps({"agent_id": self._agent}).encode() + headers = { + "Content-Type": "application/json", + "Content-Length": str(len(body)), + "Authorization": f"Bearer {ARBITER_TOKEN}", + } + conn = http.client.HTTPConnection(ARBITER_HOST, ARBITER_PORT, timeout=30) + try: + conn.request("POST", "/v1/conversations", 
body=body, headers=headers) + resp = conn.getresponse() + raw = resp.read() + if resp.status != 201: + raise RuntimeError( + f"create conversation HTTP {resp.status}: {raw[:120]!r}" + ) + return int(json.loads(raw)["id"]) + finally: + conn.close() + + +# Module-level singleton. Initialised in main() when BRIDGE_CONV_FILE is set. +_conv_mgr: ConversationManager | None = None + + +# ────────────────────────────────────────────────────────────────────────────── +# Arbiter SSE client +# ────────────────────────────────────────────────────────────────────────────── + +def _parse_sse(resp: http.client.HTTPResponse) -> Iterator[str]: + """Yield text deltas from an open Arbiter SSE response.""" + event_type = "message" + data_lines: list[str] = [] + + while True: + raw = resp.readline() + if not raw: + break + line = raw.decode("utf-8", errors="replace").rstrip("\r\n") + + if line.startswith(":"): + continue + + if not line: + if data_lines: + try: + data = json.loads("\n".join(data_lines)) + except json.JSONDecodeError: + data = {} + + if event_type == "text": + delta = data.get("delta", "") + if delta: + yield delta + elif event_type == "done": + return + elif event_type == "error": + raise RuntimeError(data.get("error", "arbiter error")) + + event_type = "message" + data_lines = [] + continue + + if line.startswith("event:"): + event_type = line[6:].strip() + elif line.startswith("data:"): + data_lines.append(line[5:].strip()) + + +def stream_arbiter_text(agent: str, message: str, idkey: str) -> Iterator[str]: + """Stateless /v1/orchestrate — used for local-tier queries.""" + body = json.dumps({"agent": agent, "message": message}).encode() + headers = { + "Content-Type": "application/json", + "Content-Length": str(len(body)), + "Authorization": f"Bearer {ARBITER_TOKEN}", + "Idempotency-Key": idkey, + "Accept": "text/event-stream", + } + conn = http.client.HTTPConnection(ARBITER_HOST, ARBITER_PORT, timeout=120) + try: + conn.request("POST", "/v1/orchestrate", body=body, 
def _cloud_stream(message: str, idkey: str) -> Iterator[str]:
    """Stream the reply for a cloud-tier turn.

    Without a conversation manager the turn goes through the stateless
    orchestrate call for the cloud agent. With one, the turn is posted to
    the stored conversation; if that conversation has expired it is marked
    stale and the turn is retried once against a newly obtained id. A second
    expiry raises ``RuntimeError``.
    """
    if _conv_mgr is not None:
        attempt = 1
        while attempt <= 2:
            conversation_id = _conv_mgr.get_or_create()
            try:
                yield from stream_conversation_text(conversation_id, message, idkey)
            except ConversationExpired:
                log.info("conversation %d expired (attempt %d), resetting",
                         conversation_id, attempt)
                _conv_mgr.mark_expired(conversation_id)
                attempt += 1
            else:
                return

        raise RuntimeError("conversation unavailable after two attempts")

    # Memory disabled: plain stateless call to the cloud agent.
    yield from stream_arbiter_text(ARBITER_CLOUD, message, idkey)
def process_utterance(
    wav_bytes: bytes,
    *,
    source: str = "-",
    complexity_hint: str = "",
) -> tuple[bytes, str]:
    """Run one full voice turn and return ``(response_wav, transcript)``.

    Steps: write the upload to a temp file and transcribe it; handle a
    memory-reset request; choose the local or cloud tier (``complexity_hint``
    wins when it is ``"local"`` or ``"cloud"``); stream the agent's text and
    synthesise each completed sentence while later text is still arriving;
    concatenate the PCM and wrap it in a WAV header.

    This function always returns a WAV for the failures it anticipates:
    an empty transcript, an Arbiter error or outage, or a reply with no
    audio each produce a spoken error (or a tone when Piper is unavailable)
    instead of raising. Arbiter failures include connection refused or reset
    and timeouts (``OSError``) and protocol errors
    (``http.client.HTTPException``), not only the ``RuntimeError`` raised for
    non-200 responses.

    ``source`` is only used for logging.
    """
    turn_id = str(uuid.uuid4())
    t0 = time.monotonic()

    # Everything that means "talking to Arbiter failed".
    arbiter_errors = (RuntimeError, OSError, http.client.HTTPException)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf:
        tf.write(wav_bytes)
        wav_path = tf.name

    try:
        transcript = transcribe(wav_path)
    finally:
        try:
            os.unlink(wav_path)
        except OSError:
            pass

    t_stt = time.monotonic()
    log.info("turn=%s src=%s stt=%.0fms transcript=%r",
             turn_id, source, (t_stt - t0) * 1000, transcript[:80])

    if not transcript:
        return make_error_wav("I didn't catch that. Please try again."), ""

    # Classify — reset commands bypass the normal tier logic.
    if needs_reset(transcript):
        if _conv_mgr is not None:
            # reset() creates a conversation over HTTP, so it can fail the
            # same way the streaming calls below can.
            try:
                new_id = _conv_mgr.reset()
            except arbiter_errors as exc:
                log.error("turn=%s arbiter error: %s", turn_id, exc)
                return make_error_wav("I ran into a problem. Please try again."), transcript
            log.info("turn=%s memory reset new_conversation=%d", turn_id, new_id)
        tier = "cloud"
        log.info("turn=%s tier=reset->cloud", turn_id)
        text_stream = _cloud_stream(transcript, turn_id)
    else:
        tier = complexity_hint if complexity_hint in ("local", "cloud") else classify(transcript)
        log.info("turn=%s tier=%s", turn_id, tier)
        if tier == "local":
            text_stream = stream_arbiter_text(ARBITER_LOCAL, transcript, turn_id)
        else:
            text_stream = _cloud_stream(transcript, turn_id)

    # Stream Arbiter text; synthesise each sentence as it completes so Piper
    # runs concurrently with the model generating the next sentence.
    pcm_futures: list[Future[bytes]] = []
    text_buf = ""

    try:
        with ThreadPoolExecutor(max_workers=2) as tts_pool:
            # The generators are lazy: the HTTP request is only made once
            # iteration starts, so connection errors surface inside this try.
            for delta in text_stream:
                text_buf += delta
                sentences, text_buf = extract_sentences(text_buf)
                for sent in sentences:
                    pcm_futures.append(tts_pool.submit(tts_sentence, sent))

            # Whatever is left after the stream ends is the final sentence.
            if text_buf.strip():
                pcm_futures.append(tts_pool.submit(tts_sentence, text_buf.strip()))

            pcm_parts = [f.result() for f in pcm_futures]

    except arbiter_errors as exc:
        log.error("turn=%s arbiter error: %s", turn_id, exc)
        return make_error_wav("I ran into a problem. Please try again."), transcript

    total_pcm = b"".join(p for p in pcm_parts if p)
    t_done = time.monotonic()
    log.info("turn=%s total=%.0fms sentences=%d pcm_bytes=%d",
             turn_id, (t_done - t0) * 1000, len(pcm_futures), len(total_pcm))

    if not total_pcm:
        return make_error_wav("I don't have a response for that."), transcript

    return make_wav(total_pcm), transcript
None: + return + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf: + tf.write(body) + wav_path = tf.name + try: + transcript = transcribe(wav_path) + finally: + try: + os.unlink(wav_path) + except OSError: + pass + self._send_json(200, json.dumps({"transcript": transcript}).encode()) + + def _read_audio_body(self) -> bytes | None: + if not self._authorized(): + self.send_error(401, "unauthorized") + return None + ip = self.client_address[0] + if not self._within_rate_limit(ip): + self.send_error(429, "rate limit exceeded") + return None + try: + length = int(self.headers.get("Content-Length") or "0") + except ValueError: + self.send_error(400, "invalid content-length") + return None + if length <= 0: + self.send_error(400, "empty body") + return None + if length > BRIDGE_MAX_BYTES: + self.send_error(413, "upload too large") + return None + body = self.rfile.read(length) + if len(body) != length: + self.send_error(400, "short read") + return None + return body + + def _authorized(self) -> bool: + if not BRIDGE_API_KEY: + return True + return self.headers.get("Authorization", "") == f"Bearer {BRIDGE_API_KEY}" + + def _within_rate_limit(self, key: str) -> bool: + now = time.monotonic() + with _rate_lock: + recent = [t for t in _rate_table.get(key, []) if now - t < _RATE_WINDOW] + if len(recent) >= BRIDGE_RATE_LIMIT: + _rate_table[key] = recent + return False + recent.append(now) + _rate_table[key] = recent + return True + + def _send_wav(self, wav: bytes) -> None: + self.send_response(200) + self.send_header("Content-Type", "audio/wav") + self.send_header("Content-Length", str(len(wav))) + self.send_header("Connection", "close") + self.end_headers() + self.wfile.write(wav) + + def _send_json(self, status: int, payload: bytes) -> None: + self.send_response(status) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(payload))) + self.send_header("Connection", "close") + self.end_headers() + 
def main() -> int:
    """Parse arguments, set up logging and memory, and serve until interrupted.

    Bind address: with ``BRIDGE_API_KEY`` set, ``--host`` is honoured and
    defaults to ``0.0.0.0``. Without it the bridge has no authentication, so
    it always binds to ``127.0.0.1`` and ignores ``--host``, as the README
    and the module docstring promise.
    """
    global _conv_mgr

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-5s %(name)s %(message)s",
        datefmt="%H:%M:%S",
    )

    parser = argparse.ArgumentParser(description="Arbiter voice bridge")
    parser.add_argument("--host", default="")
    parser.add_argument("--port", type=int, default=8081)
    a = parser.parse_args()

    if BRIDGE_API_KEY:
        host = a.host or "0.0.0.0"
    else:
        # Never expose an unauthenticated bridge beyond loopback, even when
        # --host asks for it.
        host = "127.0.0.1"
        if a.host and a.host != host:
            log.warning(
                "BRIDGE_API_KEY is not set — ignoring --host %s and binding to loopback only",
                a.host,
            )
        else:
            log.warning("BRIDGE_API_KEY is not set — binding to loopback only")

    if BRIDGE_CONV_FILE:
        _conv_mgr = ConversationManager(BRIDGE_CONV_FILE, cloud_agent=ARBITER_CLOUD)
        log.info("memory=on  file=%s", BRIDGE_CONV_FILE)
    else:
        log.info("memory=off (set BRIDGE_CONVERSATION_FILE to enable)")

    log.info("arbiter=%s:%d  local=%s  cloud=%s",
             ARBITER_HOST, ARBITER_PORT, ARBITER_LOCAL, ARBITER_CLOUD)
    log.info("whisper bin=%-20s model=%s", WHISPER_BIN, WHISPER_MODEL)
    log.info("piper   bin=%-20s model=%s  sample_rate=%d",
             PIPER_BIN, PIPER_MODEL, PIPER_SAMPLE_RATE)

    server = ThreadingHTTPServer((host, a.port), Handler)
    log.info("listening on http://%s:%d", host, a.port)
    server.serve_forever()
    return 0
+#pragma once +// arbiter/include/advisor.h — Framework-agnostic advisor-gate decision. +// +// The advisor gate's *decision* — "is this terminating turn acceptable to +// return to the caller?" — is a pure, stateless function: one history-less +// model call in, one CONTINUE/REDIRECT/HALT signal out. It is deliberately +// split from the orchestrator so two callers can share one implementation: +// +// • Orchestrator::make_advisor_gate_invoker — the in-loop gate that fires +// on an executor's terminating turn (src/orchestrator.cpp). +// • The HTTP handler for POST /v1/advise/gate — exposes the same decision +// to external frameworks that own their own executor loop. +// +// What does NOT live here, by design: the *enforcement* ("the executor +// cannot return without CONTINUE"), the redirect budget, and synthetic-turn +// re-injection. Those are loop-control concerns owned by whoever drives the +// executor. Over HTTP the calling framework must honour the verdict itself — +// the unbypassability guarantee is a property of owning the loop, not of this +// function. See docs/concepts/advisor.md. + +#include "commands.h" // AdvisorGateInput / AdvisorGateOutput / parse_advisor_signal + +#include +#include + +namespace arbiter { + +class ApiClient; +struct ApiResponse; + +// The runtime gate's default system prompt. Exposed so the in-loop gate and +// the standalone endpoint share one source of truth; callers override it via +// the `prompt_override` argument (constitution `advisor.prompt`, or the +// request body's `advisor.prompt`). +const char* default_gate_prompt(); + +// Make one history-less advisor call and parse its reply into a gate signal. +// +// advisor_model provider-prefixed model id (e.g. "claude-opus-4-7"). +// Empty ⇒ returns Halt(malformed) — defence-in-depth; the +// caller is expected to have checked mode == "gate" first. +// prompt_override gate system-prompt override; empty ⇒ default_gate_prompt(). 
+// in structured executor context (task, terminating text, +// tool summary). +// on_response optional hook fired with the raw ApiResponse after the +// call — used by the orchestrator to attribute cost to the +// caller's ledger. A standalone deployment wires its own +// metering here (the runtime keeps no usage ledger). +// +// On transport/model error returns kind=Halt, malformed=true, with the +// provider error in `text`/`raw`. This function applies NO fail-open / +// fail-closed policy on a *parseable-but-malformed* reply — it returns the +// parser's verdict (which may be kind=Continue with malformed=true) and lets +// the caller apply its own `malformed_halts` policy, exactly as the in-loop +// gate does. +AdvisorGateOutput run_advisor_gate( + ApiClient& client, + const std::string& advisor_model, + const std::string& prompt_override, + const AdvisorGateInput& in, + const std::function& on_response = nullptr); + +} // namespace arbiter diff --git a/src/advisor.cpp b/src/advisor.cpp new file mode 100644 index 0000000..30437a0 --- /dev/null +++ b/src/advisor.cpp @@ -0,0 +1,100 @@ +// src/advisor.cpp — Framework-agnostic advisor-gate decision. +// +// The signal *parser* (parse_advisor_signal) lives in src/advisor_gate.cpp so +// it can be unit-tested without linking the provider client. This translation +// unit adds the part that needs ApiClient: formatting the gate prompt and +// making the call. Both the orchestrator's in-loop gate and the standalone +// POST /v1/advise/gate handler route through run_advisor_gate. + +#include "advisor.h" +#include "api_client.h" +#include "commands.h" + +#include +#include + +namespace arbiter { + +const char* default_gate_prompt() { + // The prompt explicitly enumerates the three signals and the tag-based + // grammar — the parser is strict about tag form, so the model must + // produce it verbatim. Kept identical to the wording shipped before the + // gate was factored out of the orchestrator. 
+ return + "You are a runtime gate evaluating whether an executor agent's " + "terminating turn is acceptable to return to the caller.\n\n" + "Inputs you receive (in this order):\n" + " - The original user task.\n" + " - The executor's outputs for the terminating turn (text only — " + "no reasoning, no prior turns).\n" + " - A structured summary of tool calls made this turn.\n\n" + "You will respond with EXACTLY ONE signal on its own line:\n\n" + " CONTINUE\n" + " The terminating turn satisfies the task; let the executor return.\n\n" + " REDIRECT\n" + " ...\n" + " The executor is on the wrong track or stopped early. Provide a " + "concrete next step in . This will be injected as a " + "synthetic user turn back to the executor.\n\n" + " HALT\n" + " ...\n" + " The executor produced something the user must see before any " + "further work — irreversible footgun about to commit, scope " + "explosion, confidential data leak, fundamentally wrong premise. " + "This will be surfaced to the user as an escalation.\n\n" + "No preamble. No markdown. Output exactly one signal. Default " + "to CONTINUE when the turn is merely terse but correct. Default " + "to HALT when in doubt about safety; default to REDIRECT when in " + "doubt about correctness."; +} + +AdvisorGateOutput run_advisor_gate( + ApiClient& client, + const std::string& advisor_model, + const std::string& prompt_override, + const AdvisorGateInput& in, + const std::function& on_response) { + + AdvisorGateOutput out; + + if (advisor_model.empty()) { + // Defence-in-depth: callers should already have checked + // mode == "gate", but if a misconfiguration slips through we fail + // closed with a HALT explaining the issue. 
+ out.kind = AdvisorGateOutput::Kind::Halt; + out.text = "no advisor model configured for gate"; + out.malformed = true; + return out; + } + + std::ostringstream q; + q << "[ORIGINAL TASK]\n" << in.original_task << "\n[END ORIGINAL TASK]\n\n" + << "[EXECUTOR TERMINATING TURN]\n" << in.terminating_text + << "\n[END EXECUTOR TERMINATING TURN]\n\n" + << "[TOOL CALLS THIS TURN]\n" + << (in.tool_summary.empty() ? "(none)\n" : in.tool_summary) + << "[END TOOL CALLS]\n"; + + ApiRequest req; + req.model = advisor_model; + req.max_tokens = 512; // signals are short + req.include_temperature = false; // reasoning models reject temperature + req.system_prompt = prompt_override.empty() + ? std::string(default_gate_prompt()) + : prompt_override; + req.messages = {{"user", q.str()}}; + + ApiResponse resp = client.complete(req); + if (on_response) on_response(resp); + if (!resp.ok) { + out.kind = AdvisorGateOutput::Kind::Halt; + out.text = "advisor API error: " + resp.error; + out.malformed = true; + out.raw = resp.error; + return out; + } + + return parse_advisor_signal(resp.content); +} + +} // namespace arbiter diff --git a/src/orchestrator.cpp b/src/orchestrator.cpp index 29ce809..53b6c6d 100644 --- a/src/orchestrator.cpp +++ b/src/orchestrator.cpp @@ -1,5 +1,6 @@ // arbiter/src/orchestrator.cpp #include "orchestrator.h" +#include "advisor.h" #include "commands.h" #include "config.h" #include "tui/stream_filter.h" @@ -502,14 +503,15 @@ AdvisorInvoker Orchestrator::make_advisor_invoker(const std::string& caller_id) AdvisorGateInvoker Orchestrator::make_advisor_gate_invoker(const std::string& caller_id) { return [this, caller_id](const AdvisorGateInput& in) -> AdvisorGateOutput { - AdvisorGateOutput out; - // Resolve the advisor model + optional prompt override from the // caller's constitution. 
The structured `advisor` block is the // source of truth for gate behaviour; the legacy `advisor_model` // field is consulted only as a fallback when the structured model // is empty (which can happen if a caller wired the gate via - // configuration outside the JSON parser path). + // configuration outside the JSON parser path). The actual + // formatting + provider call + signal parse lives in + // run_advisor_gate (src/advisor.cpp), shared with the standalone + // POST /v1/advise/gate endpoint. std::string advisor_model; std::string prompt_override; if (caller_id == "index") { @@ -521,6 +523,7 @@ AdvisorGateInvoker Orchestrator::make_advisor_gate_invoker(const std::string& ca std::lock_guard lk(agents_mutex_); auto it = agents_.find(caller_id); if (it == agents_.end()) { + AdvisorGateOutput out; out.kind = AdvisorGateOutput::Kind::Halt; out.text = "no agent '" + caller_id + "' for gate"; out.malformed = true; @@ -531,75 +534,13 @@ AdvisorGateInvoker Orchestrator::make_advisor_gate_invoker(const std::string& ca : cfg.advisor.model; prompt_override = cfg.advisor.prompt; } - if (advisor_model.empty()) { - // Defence-in-depth: callers should already have checked - // mode == "gate", but if a misconfiguration slips through - // we fail closed with a HALT explaining the issue. - out.kind = AdvisorGateOutput::Kind::Halt; - out.text = "no advisor model configured for gate on '" + caller_id + "'"; - out.malformed = true; - return out; - } - - // Default gate prompt. Tenant can override via advisor.prompt. - // The prompt explicitly enumerates the three signals and the - // tag-based grammar — the parser is strict about tag form, so the - // model must produce it verbatim. 
- static constexpr const char* kDefaultGatePrompt = - "You are a runtime gate evaluating whether an executor agent's " - "terminating turn is acceptable to return to the caller.\n\n" - "Inputs you receive (in this order):\n" - " - The original user task.\n" - " - The executor's outputs for the terminating turn (text only — " - "no reasoning, no prior turns).\n" - " - A structured summary of tool calls made this turn.\n\n" - "You will respond with EXACTLY ONE signal on its own line:\n\n" - " CONTINUE\n" - " The terminating turn satisfies the task; let the executor return.\n\n" - " REDIRECT\n" - " ...\n" - " The executor is on the wrong track or stopped early. Provide a " - "concrete next step in . This will be injected as a " - "synthetic user turn back to the executor.\n\n" - " HALT\n" - " ...\n" - " The executor produced something the user must see before any " - "further work — irreversible footgun about to commit, scope " - "explosion, confidential data leak, fundamentally wrong premise. " - "This will be surfaced to the user as an escalation.\n\n" - "No preamble. No markdown. Output exactly one signal. Default " - "to CONTINUE when the turn is merely terse but correct. Default " - "to HALT when in doubt about safety; default to REDIRECT when in " - "doubt about correctness."; - - std::ostringstream q; - q << "[ORIGINAL TASK]\n" << in.original_task << "\n[END ORIGINAL TASK]\n\n" - << "[EXECUTOR TERMINATING TURN]\n" << in.terminating_text - << "\n[END EXECUTOR TERMINATING TURN]\n\n" - << "[TOOL CALLS THIS TURN]\n" - << (in.tool_summary.empty() ? "(none)\n" : in.tool_summary) - << "[END TOOL CALLS]\n"; - - ApiRequest req; - req.model = advisor_model; - req.max_tokens = 512; // signals are short - req.include_temperature = false; // reasoning models reject temperature - req.system_prompt = prompt_override.empty() - ? 
std::string(kDefaultGatePrompt) - : prompt_override; - req.messages = {{"user", q.str()}}; - ApiResponse resp = client_.complete(req); - if (cost_cb_) cost_cb_(caller_id, advisor_model, resp); - if (!resp.ok) { - out.kind = AdvisorGateOutput::Kind::Halt; - out.text = "advisor API error: " + resp.error; - out.malformed = true; - out.raw = resp.error; - return out; - } - - return parse_advisor_signal(resp.content); + // Attribute the advisor's cost to the caller's ledger with the + // advisor model's pricing — same as the /advise consult path. + return run_advisor_gate(client_, advisor_model, prompt_override, in, + [this, &caller_id, &advisor_model](const ApiResponse& resp) { + if (cost_cb_) cost_cb_(caller_id, advisor_model, resp); + }); }; } From 9cac24418e419056a447dc893c18f5ad49b81967 Mon Sep 17 00:00:00 2001 From: Tyler Reckart Date: Tue, 23 Jun 2026 09:18:17 -0400 Subject: [PATCH 2/4] hardware event bus --- CMakeLists.txt | 1 + include/api_client.h | 5 + include/api_server.h | 1 + include/cli.h | 3 +- include/config.h | 5 + include/constitution.h | 8 ++ include/orchestrator.h | 23 ++++- include/repl/queues.h | 13 +++ src/api_client.cpp | 70 ++++++++++---- src/api_server.cpp | 202 +++++++++++++++++++++++++++++++++++++++- src/cli.cpp | 21 ++++- src/commands.cpp | 42 +++++++-- src/constitution.cpp | 12 +++ src/main.cpp | 89 ++++++++++++++++-- src/orchestrator.cpp | 107 +++++++++++++++++++-- src/repl/queues.cpp | 38 +++++--- tests/test_commands.cpp | 92 ++++++++++++++++++ 17 files changed, 669 insertions(+), 63 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ea45605..751b205 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,6 +95,7 @@ set(ARBITER_SOURCES src/constitution.cpp src/orchestrator.cpp src/advisor_gate.cpp + src/advisor.cpp src/api_server.cpp src/sandbox.cpp src/idempotency_cache.cpp diff --git a/include/api_client.h b/include/api_client.h index ff96d06..7a11bdb 100644 --- a/include/api_client.h +++ b/include/api_client.h @@ -142,12 
+142,14 @@ class ApiClient { // and record success/failure on the result. Non-owning pointer; // null is fine and disables the breaker layer (legacy behaviour). void set_circuit_breaker(ProviderCircuitBreaker* cb) { breaker_ = cb; } + ProviderCircuitBreaker* circuit_breaker() const { return breaker_; } // Attach the process-wide metrics registry. When set, every // upstream call increments arbiter_provider_calls_total and the // appropriate retry / 5xx / 429 counters. Non-owning; null is // fine. void set_metrics(Metrics* m) { metrics_ = m; } + Metrics* metrics() const { return metrics_; } // Interrupt any in-progress streaming call. Shuts down every open socket // so an in-flight SSL_read / read returns immediately. Thread-safe. @@ -189,6 +191,9 @@ class ApiClient { std::vector mask; }; std::map api_keys_; + // Returns the unmasked key. The caller must zero the returned string + // before it goes out of scope; call sites wrap it in SensitiveString + // (defined in api_client.cpp) for automatic zeroing on scope exit. std::string unmask_api_key(const std::string& provider) const; SSL_CTX* ssl_ctx_ = nullptr; diff --git a/include/api_server.h b/include/api_server.h index 6747aea..99e20c9 100644 --- a/include/api_server.h +++ b/include/api_server.h @@ -101,6 +101,7 @@ struct ApiServerOptions { std::map api_keys; // provider name → key bool exec_disabled = true; // /exec policy + bool host_exec_enabled = false; // true → /exec via popen() on host; see --allow-host-exec size_t file_max_bytes = 10 * 1024 * 1024; // per-response cap // ── Per-tenant sandbox ─────────────────────────────────────────── diff --git a/include/cli.h b/include/cli.h index f5a1ae0..5d6ad3d 100644 --- a/include/cli.h +++ b/include/cli.h @@ -29,7 +29,8 @@ namespace arbiter { // writes ones that don't exist yet — re-run with force=true (CLI: --force) // to reset every starter back to the embedded definition. 
void cmd_init(bool force = false); -void cmd_api(int port, const std::string& bind, bool verbose); +void cmd_api(int port, const std::string& bind, bool verbose, + bool allow_host_exec = false); void cmd_oneshot(const std::string& agent_id, const std::string& msg); // Tenant admin. Each opens ~/.arbiter/tenants.db, runs one operation, and diff --git a/include/config.h b/include/config.h index 0f1cb43..fbbe766 100644 --- a/include/config.h +++ b/include/config.h @@ -13,6 +13,11 @@ struct Config { // by StreamFilter and ToolCallIndicator surfaces a single "N tool calls…" // spinner in the status bar for the duration of the turn. bool verbose = false; + + // When false (set by --no-exec), /exec commands are blocked in TUI mode + // just as they are in API mode when no sandbox invoker is wired up. + // Default true so existing TUI behaviour is unchanged. + bool exec_allowed = true; }; } // namespace arbiter diff --git a/include/constitution.h b/include/constitution.h index cf8ffaf..1af7cb8 100644 --- a/include/constitution.h +++ b/include/constitution.h @@ -121,6 +121,14 @@ struct Constitution { // Example: {"/fetch", "/mem"} for researcher, {"/exec", "/write"} for devops. std::vector capabilities; + // --- Event routing --- + // Glob patterns for hardware/software events this agent handles via + // POST /v1/events. Matched with fnmatch(pattern, type, 0) — '*' + // matches any sequence of characters including dots, so "sensor.*" + // matches "sensor.temp.threshold_exceeded". Empty vector → agent is + // not event-routed (events fall through to index by default). + std::vector event_types; + // --- Computed --- std::string build_system_prompt() const; diff --git a/include/orchestrator.h b/include/orchestrator.h index ffd4992..543ecab 100644 --- a/include/orchestrator.h +++ b/include/orchestrator.h @@ -14,6 +14,14 @@ namespace arbiter { +// Scan *.json agent files in agents_dir for the first agent whose event_types +// array contains a glob pattern matching event_type. 
Returns that agent's id +// (Constitution::name or filename stem), or "index" if no match. Reads +// constitution files on each call — intended for the infrequent /v1/events +// dispatch path, not a hot loop. +std::string route_event(const std::string& agents_dir, + const std::string& event_type); + class Orchestrator { public: explicit Orchestrator(std::map api_keys); @@ -341,7 +349,10 @@ class Orchestrator { void save_session(const std::string& path) const; bool load_session(const std::string& path); // returns true if anything loaded - // Token tracking + // Token tracking — counts the shared client only. Per-child ApiClients + // created for /parallel turns track their own counters independently, so + // these totals undercount tokens spent in parallel turns. Per-turn cost + // attribution via cost_cb_ is unaffected (it reads ApiResponse directly). int total_input_tokens() const { return client_.total_input_tokens(); } int total_output_tokens() const { return client_.total_output_tokens(); } @@ -377,6 +388,16 @@ class Orchestrator { private: ApiClient client_; + // Stored so make_parallel_invoker can create per-child ApiClient instances + // (each with its own connection pool) instead of sharing the parent's + // conn_mutex_. Keys are plaintext — same exposure as the constructor arg. + std::map api_keys_; + // Non-owning pointers to child ApiClients active during a /parallel turn. + // cancel() iterates this under parallel_clients_mu_ so a cancel request + // reaches in-flight parallel children, not just the parent client. 
+ std::mutex parallel_clients_mu_; + std::vector parallel_clients_; + std::unordered_map> agents_; mutable std::mutex agents_mutex_; std::string memory_dir_; diff --git a/include/repl/queues.h b/include/repl/queues.h index 5fe6444..f3d9331 100644 --- a/include/repl/queues.h +++ b/include/repl/queues.h @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -60,6 +61,10 @@ class OutputQueue { // a single end_message() make up one logical message. If a prior // message ended (via end_message or push_msg), the first push that // follows automatically gets a blank-line separator prepended. + // + // If a notify function has been wired in via set_notify_fn(), it is + // called after appending so the pump thread can wake immediately instead + // of waiting for its next timer tick. void push(const std::string& s); // Mark the current message as complete. Idempotent — multiple @@ -74,12 +79,20 @@ class OutputQueue { std::string drain(); + // Wire a callback to be fired (without holding mu_) on every push(). + // The pump thread sets this to a closure that signals its condition + // variable; call sites don't need to know the CV exists. + void set_notify_fn(std::function fn); + private: std::mutex mu_; std::string buf_; // True when the previous push ended a message — the next push applies // exactly one blank-line separator before appending its content. bool need_sep_ = false; + + // Optional pump-wakeup hook — set by the pump thread at startup. + std::function notify_fn_; }; } // namespace arbiter diff --git a/src/api_client.cpp b/src/api_client.cpp index 6438bbd..6fbd66f 100644 --- a/src/api_client.cpp +++ b/src/api_client.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -208,6 +209,16 @@ ApiClient::ApiClient(std::map api_keys) { for (size_t i = 0; i < key.size(); ++i) { mk.masked[i] = static_cast(key[i]) ^ mk.mask[i]; } + // Lock both buffers into RAM so they are never written to swap. 
+ // mlock() may fail when the process hits RLIMIT_MEMLOCK (common in + // containers); degrade gracefully — the XOR masking still protects + // against simple credential-scanner passes. + if (::mlock(mk.masked.data(), mk.masked.size()) != 0 || + ::mlock(mk.mask.data(), mk.mask.size()) != 0) { + ::fprintf(stderr, + "WARN: mlock failed for API key '%s' — key may appear in swap\n", + name.c_str()); + } // Wipe the plaintext from the input map before dropping it. OPENSSL_cleanse(key.data(), key.size()); api_keys_.emplace(name, std::move(mk)); @@ -216,14 +227,39 @@ ApiClient::ApiClient(std::map api_keys) { ApiClient::~ApiClient() { for (auto& [_, mk] : api_keys_) { - if (!mk.masked.empty()) OPENSSL_cleanse(mk.masked.data(), mk.masked.size()); - if (!mk.mask.empty()) OPENSSL_cleanse(mk.mask.data(), mk.mask.size()); + // Zero while the pages are still locked, then unlock: once munlock() + // returns the page may be swapped, so it must hold no key bytes by then. + if (!mk.masked.empty()) { + OPENSSL_cleanse(mk.masked.data(), mk.masked.size()); + ::munlock(mk.masked.data(), mk.masked.size()); + } + if (!mk.mask.empty()) { + OPENSSL_cleanse(mk.mask.data(), mk.mask.size()); + ::munlock(mk.mask.data(), mk.mask.size()); + } } api_keys_.clear(); for (auto& [_, c] : conns_) close_connection(c); if (ssl_ctx_) SSL_CTX_free(ssl_ctx_); } +// RAII wrapper that zeros a string's buffer on scope exit regardless of +// how the scope exits (return, exception, etc.). Used for the short-lived +// plaintext copies produced by unmask_api_key so the key is zeroed even +// when an exception unwinds through the call site before the manual wipe. 
+struct SensitiveString { + std::string value; + SensitiveString() = default; + explicit SensitiveString(std::string s) : value(std::move(s)) {} + SensitiveString(const SensitiveString&) = delete; + SensitiveString& operator=(const SensitiveString&) = delete; + SensitiveString(SensitiveString&&) = default; + SensitiveString& operator=(SensitiveString&&) = default; + ~SensitiveString() { + if (!value.empty()) OPENSSL_cleanse(value.data(), value.size()); + } +}; + std::string ApiClient::unmask_api_key(const std::string& provider) const { auto it = api_keys_.find(provider); if (it == api_keys_.end()) return {}; @@ -664,32 +700,33 @@ void ApiClient::send_request(const Provider& p, Conn& c, } http << "\r\n"; http << "Content-Type: application/json\r\n"; - // Materialise the plaintext key only while building the request header, - // then wipe it below before the call returns. Limits the window during - // which the raw token is present in process memory. Missing key for a - // provider that requires one → clean error via the caller's catch(...) + // Materialise the plaintext key only while building the request header. + // SensitiveString zeroes its buffer on scope exit regardless of how the + // scope exits (normal return, throw, etc.), so the key can't leak through + // an exception path that bypasses the manual wipe below. Missing key for + // a provider that requires one → clean error via the caller's catch(...) // in complete() / stream(). 
- std::string key_plain; + SensitiveString key_sensitive; if (p.uses_api_key) { if (api_keys_.find(p.name) == api_keys_.end()) { throw std::runtime_error( "No API key configured for provider '" + p.name + "'"); } - key_plain = unmask_api_key(p.name); - if (key_plain.empty()) { + key_sensitive.value = unmask_api_key(p.name); + if (key_sensitive.value.empty()) { throw std::runtime_error( "No API key configured for provider '" + p.name + "'"); } if (p.format == Provider::FORMAT_ANTHROPIC) { - http << "x-api-key: " << key_plain << "\r\n"; + http << "x-api-key: " << key_sensitive.value << "\r\n"; http << "anthropic-version: 2023-06-01\r\n"; http << "anthropic-beta: prompt-caching-2024-07-31\r\n"; } else if (p.format == Provider::FORMAT_GEMINI) { // Gemini supports both `?key=…` and `x-goog-api-key`; the header // form keeps the token out of URLs and proxy access logs. - http << "x-goog-api-key: " << key_plain << "\r\n"; + http << "x-goog-api-key: " << key_sensitive.value << "\r\n"; } else { - http << "Authorization: Bearer " << key_plain << "\r\n"; + http << "Authorization: Bearer " << key_sensitive.value << "\r\n"; } } http << "Content-Length: " << body.size() << "\r\n"; @@ -697,12 +734,9 @@ void ApiClient::send_request(const Provider& p, Conn& c, http << "Connection: keep-alive\r\n"; http << "\r\n"; http << body; - - // Wipe the temporary unmasked key — the serialized request still holds - // it, but that buffer is wiped immediately after send() below. - if (!key_plain.empty()) { - OPENSSL_cleanse(key_plain.data(), key_plain.size()); - } + // key_sensitive's destructor zeroes the key when its enclosing scope exits + // (return or throw) — later than this point, since `raw` is built next. That + // `raw` buffer also holds the key; it is wiped after send() completes. 
std::string raw = http.str(); int total = static_cast(raw.size()); diff --git a/src/api_server.cpp b/src/api_server.cpp index 32007e7..b71394a 100644 --- a/src/api_server.cpp +++ b/src/api_server.cpp @@ -8,6 +8,7 @@ #include "api_server.h" +#include "advisor.h" #include "commands.h" #include "config.h" #include "constitution.h" @@ -5203,6 +5204,89 @@ void handle_schedule_runs(int fd, int64_t task_id, const HttpRequest& /*req*/, write_json_response(fd, 200, out); } +// ── POST /v1/advise/gate ───────────────────────────────────────────────────── +// Stateless, one-shot advisor gate call. Exposes run_advisor_gate() to +// external callers that own their own executor loop and need to honour the +// gate verdict themselves. Bearer auth required; no orchestrator involved. +// +// Request body (JSON): +// advisor_model — provider-prefixed model id (required) +// prompt — gate system-prompt override (optional) +// original_task — the user's task given to the executor (required) +// terminating_text — the executor's text for its terminating turn (required) +// tool_summary — pre-formatted tool call summary, one line per call (optional) +// +// Response body (200): +// signal — "CONTINUE" | "REDIRECT" | "HALT" +// text — guidance (REDIRECT) or reason (HALT); "" for CONTINUE +// malformed — true when the advisor's reply was unparseable +void handle_advise_gate(int fd, const HttpRequest& req, + const std::map& api_keys) { + if (req.method != "POST") { + write_plain_response(fd, 405, "Method Not Allowed", "method not allowed\n"); + return; + } + + std::shared_ptr body; + try { body = json_parse(req.body); } + catch (const std::exception& e) { + auto err = jobj(); + err->as_object_mut()["error"] = jstr(std::string("invalid JSON: ") + e.what()); + write_json_response(fd, 400, err); + return; + } + if (!body || !body->is_object()) { + auto err = jobj(); + err->as_object_mut()["error"] = jstr("body must be a JSON object"); + write_json_response(fd, 400, err); + return; + } + + 
const std::string advisor_model = body->get_string("advisor_model", ""); + const std::string prompt_override = body->get_string("prompt", ""); + const std::string original_task = body->get_string("original_task", ""); + const std::string terminating_text = body->get_string("terminating_text", ""); + const std::string tool_summary = body->get_string("tool_summary", ""); + + if (advisor_model.empty()) { + auto err = jobj(); + err->as_object_mut()["error"] = jstr("advisor_model is required"); + write_json_response(fd, 400, err); + return; + } + if (original_task.empty()) { + auto err = jobj(); + err->as_object_mut()["error"] = jstr("original_task is required"); + write_json_response(fd, 400, err); + return; + } + if (terminating_text.empty()) { + auto err = jobj(); + err->as_object_mut()["error"] = jstr("terminating_text is required"); + write_json_response(fd, 400, err); + return; + } + + AdvisorGateInput in; + in.original_task = original_task; + in.terminating_text = terminating_text; + in.tool_summary = tool_summary; + + ApiClient client(api_keys); + AdvisorGateOutput out = run_advisor_gate(client, advisor_model, prompt_override, in); + + const char* signal_str = "CONTINUE"; + if (out.kind == AdvisorGateOutput::Kind::Redirect) signal_str = "REDIRECT"; + else if (out.kind == AdvisorGateOutput::Kind::Halt) signal_str = "HALT"; + + auto resp = jobj(); + auto& m = resp->as_object_mut(); + m["signal"] = jstr(signal_str); + m["text"] = jstr(out.text); + m["malformed"] = jbool(out.malformed); + write_json_response(fd, 200, resp); +} + void handle_runs_list(int fd, const HttpRequest& req, TenantStore& tenants, const Tenant& tenant) { int64_t since = 0; @@ -6042,11 +6126,23 @@ MCPInvoker make_mcp_invoker_callback(std::shared_ptr mcp_mgr) { ExecInvoker make_exec_invoker_callback(const ApiServerOptions& opts, int64_t tenant_id) { SandboxManager* mgr = opts.sandbox; - if (!mgr) return nullptr; - return [mgr, tenant_id](const std::string& cmd) -> std::string { - 
SandboxExecResult r = mgr->exec(tenant_id, cmd); - return r.output; - }; + if (mgr) { + return [mgr, tenant_id](const std::string& cmd) -> std::string { + SandboxExecResult r = mgr->exec(tenant_id, cmd); + return r.output; + }; + } + // Host exec only when explicitly opted in AND no sandbox was requested. + // If sandbox_enabled is true but mgr is null (sandbox failed usability), + // refuse the host path — falling back silently would give wider access + // than the operator intended. They must remove ARBITER_SANDBOX_IMAGE + // to use host exec. + if (opts.host_exec_enabled && !opts.sandbox_enabled) { + return [](const std::string& cmd) -> std::string { + return cmd_exec(cmd); + }; + } + return nullptr; } // Returns nullptr when the configured provider isn't supported or no @@ -9558,6 +9654,78 @@ build_blocking_orchestrator(const ApiServerOptions& opts, return build_a2a_orchestrator(opts, tenants, tenant, err_out); } +// POST /v1/events — hardware/software event ingestion. +// +// Accepts a structured event envelope, routes it to the first agent whose +// constitution event_types patterns match the event's type (or uses the +// explicit "agent" override), formats the event as a natural-language +// message, and delegates to handle_orchestrate for SSE streaming. The +// agent sees the event as a normal turn and may use any writ (/exec, /write, +// /agent, etc.) in its response. 
+void handle_event_ingest(int fd, HttpRequest req, + const ApiServerOptions& opts, + TenantStore& tenants, + InFlightRegistry& in_flight, + BillingClient* billing, + const std::string& workspace_id, + const Tenant& tenant, + RequestEventBus* request_events) { + if (req.method != "POST") { + write_plain_response(fd, 405, "Method Not Allowed", "method not allowed\n"); + return; + } + + std::shared_ptr body; + try { body = json_parse(req.body); } + catch (const std::exception& e) { + auto err = jobj(); + err->as_object_mut()["error"] = jstr(std::string("invalid JSON: ") + e.what()); + write_json_response(fd, 400, err); + return; + } + if (!body || !body->is_object()) { + auto err = jobj(); + err->as_object_mut()["error"] = jstr("body must be a JSON object"); + write_json_response(fd, 400, err); + return; + } + + const std::string event_type = body->get_string("type", ""); + if (event_type.empty()) { + auto err = jobj(); + err->as_object_mut()["error"] = jstr("\"type\" is required"); + write_json_response(fd, 400, err); + return; + } + + const std::string source = body->get_string("source", ""); + const std::string agent = body->get_string("agent", ""); + auto payload_val = body->get("payload"); + + // Route: explicit override wins; otherwise scan agent constitutions. + std::string agent_id = agent.empty() + ? route_event(opts.agents_dir, event_type) + : agent; + + // Format event as a natural-language message the agent can reason about. + std::string message = "Event: " + event_type; + if (!source.empty()) message += "\nSource: " + source; + if (payload_val) message += "\nPayload: " + json_serialize(*payload_val); + + // Rewrite request body as orchestrate format and delegate. + // handle_orchestrate reads req.body["message"] and uses agent_override + // to target the routed agent; everything else (SSE, auth, billing, + // writ execution) flows through the existing path unchanged. 
+ auto synth = jobj(); + synth->as_object_mut()["message"] = jstr(message); + req.body = json_serialize(*synth); + + handle_orchestrate(fd, req, opts, tenants, in_flight, + billing, workspace_id, tenant, + request_events, + /*agent_override=*/agent_id); +} + // ─── ApiServer public API ─────────────────────────────────────────────────── ApiServer::ApiServer(ApiServerOptions opts, TenantStore& tenants) @@ -10594,6 +10762,30 @@ void ApiServer::handle_connection(int fd) { handle_notifications_stream(fd, *tenant, *notifications_); return; } + + // ── Advisor gate ────────────────────────────────────────────────── + // POST /v1/advise/gate — stateless gate verdict for external callers + if (segs.size() == 3 && segs[0] == "v1" && segs[1] == "advise" + && segs[2] == "gate") { + return handle_advise_gate(fd, req, opts_.api_keys); + } + + // ── Event ingestion ─────────────────────────────────────────────── + // POST /v1/events — hardware/software event → agent routing + if (segs.size() == 2 && segs[0] == "v1" && segs[1] == "events") { + auto lim = limiter_->acquire(tenant->id); + if (!lim.granted()) { + write_429_response(fd, lim.retry_after_seconds, + lim.kind == TenantLimiter::Result::Kind::ConcurrentExceeded + ? 
"concurrent_request_limit" + : "rate_limit", + metrics_.get(), tenant->id); + return; + } + return handle_event_ingest(fd, req, opts_, tenants_, in_flight_, + billing_.get(), workspace_id, *tenant, + request_events_.get()); + } } write_plain_response(fd, 404, "Not Found", "endpoint not found\n"); diff --git a/src/cli.cpp b/src/cli.cpp index cf19e23..333c2c5 100644 --- a/src/cli.cpp +++ b/src/cli.cpp @@ -180,7 +180,8 @@ void cmd_init(bool force) { } } -void cmd_api(int port, const std::string& bind, bool verbose) { +void cmd_api(int port, const std::string& bind, bool verbose, + bool allow_host_exec) { // Pick up ARBITER_LOG_FORMAT before any of the startup-path log // calls fire — structured JSON deployments expect every line on // stderr to be machine-parseable. @@ -227,6 +228,24 @@ void cmd_api(int port, const std::string& bind, bool verbose) { opts.memory_root = dir + "/memory"; // per-tenant subdirs land under here opts.api_keys = std::move(api_keys); opts.exec_disabled = true; // SaaS default: no shell + // Host exec opt-in: CLI flag or ARBITER_ALLOW_HOST_EXEC=1. Sandbox + // takes precedence when both are set (make_exec_invoker_callback checks + // sandbox first). + { + bool host_exec = allow_host_exec; + if (!host_exec) { + const char* env = std::getenv("ARBITER_ALLOW_HOST_EXEC"); + host_exec = (env && env[0] == '1' && env[1] == '\0'); + } + if (host_exec) { + opts.host_exec_enabled = true; + opts.exec_disabled = false; + ::fprintf(stderr, + "WARN: host exec enabled — agents can run shell commands " + "as this process (uid %d)\n", + (int)::getuid()); + } + } opts.admin_token = admin_token; opts.log_verbose = log_verbose; // MCP registry — file is optional. 
If present, every /v1/orchestrate diff --git a/src/commands.cpp b/src/commands.cpp index 6e507f4..25d0802 100644 --- a/src/commands.cpp +++ b/src/commands.cpp @@ -35,16 +35,27 @@ std::vector parse_agent_commands(const std::string& response) { std::vector result; std::istringstream ss(response); std::string line; - bool in_code_block = false; + // Track open code fence by its exact opening sequence ("```" or "~~~"). + // A fence only closes when a line starts with the same sequence that + // opened it — mismatched pairs don't cross-close and can't be used to + // escape a block mid-stream. Empty = not currently inside a fence. + std::string current_fence; while (std::getline(ss, line)) { - // Track code fences (``` or ~~~) - if (line.size() >= 3 && - (line.substr(0, 3) == "```" || line.substr(0, 3) == "~~~")) { - in_code_block = !in_code_block; - continue; + // Track code fences (``` or ~~~), matched by opening sequence. + if (line.size() >= 3) { + std::string pfx = line.substr(0, 3); + if (pfx == "```" || pfx == "~~~") { + if (current_fence.empty()) { + current_fence = pfx; // open + } else if (current_fence == pfx) { + current_fence.clear(); // close — only when sequence matches + } + // mismatched fence (open "```", see "~~~") → stay in block + continue; + } } - if (in_code_block) continue; + if (!current_fence.empty()) continue; // Trim trailing whitespace / CR while (!line.empty() && (line.back() == ' ' || line.back() == '\r')) @@ -2598,6 +2609,17 @@ std::string execute_agent_commands(const std::vector& cmds, } } + // Duplicate agent_id detection — each id must appear at most once + // in the batch; two concurrent calls would race the same history. 
+ std::string parallel_dup_id; + { + std::set seen_ids; + for (const auto& [id, msg] : children) { + (void)msg; + if (!seen_ids.insert(id).second) { parallel_dup_id = id; break; } + } + } + block << "[/parallel " << children.size() << " children]\n"; if (cmd.truncated) { block << "ERR: /parallel block was cut off (missing " @@ -2608,6 +2630,12 @@ std::string execute_agent_commands(const std::vector& cmds, "/agent is permitted inside " "/parallel.../endparallel\n"; cache_result = false; + } else if (!parallel_dup_id.empty()) { + block << "ERR: /parallel cannot invoke '" << parallel_dup_id + << "' twice in one batch — each agent_id must appear" + " at most once. Use separate /agent calls or" + " different agent ids.\n"; + cache_result = false; } else if (!parallel_invoker) { block << "ERR: /parallel unavailable in this context\n"; cache_result = false; diff --git a/src/constitution.cpp b/src/constitution.cpp index 355089a..5c0392c 100644 --- a/src/constitution.cpp +++ b/src/constitution.cpp @@ -917,6 +917,12 @@ std::string Constitution::to_json() const { m["capabilities"] = cap; } + if (!event_types.empty()) { + auto et = jarr(); + for (auto& e : event_types) et->as_array_mut().push_back(jstr(e)); + m["event_types"] = et; + } + // Memory block — only emit when at least one toggle deviates from // the default, so round-tripping a default config stays compact. 
Constitution::MemoryConfig defaults; @@ -1043,6 +1049,12 @@ Constitution Constitution::from_json(const std::string& json_str) { if (v && v->is_string()) c.capabilities.push_back(v->as_string()); } } + auto et_val = root->get("event_types"); + if (et_val && et_val->is_array()) { + for (auto& v : et_val->as_array()) { + if (v && v->is_string()) c.event_types.push_back(v->as_string()); + } + } return c; } diff --git a/src/main.cpp b/src/main.cpp index dbdebb8..aa57c81 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -114,7 +114,7 @@ static void getc_flush_output() { // ───────────────────────────────────────────────────────────────────────────── -static void cmd_interactive() { +static void cmd_interactive(bool exec_allowed_flag = true) { std::string dir = get_config_dir(); auto api_keys = get_api_keys(); @@ -127,6 +127,8 @@ static void cmd_interactive() { // turn checks it on the fly — no need to rebuild the filter when the // flag changes. Not persisted across sessions by design. arbiter::Config cfg; + cfg.exec_allowed = exec_allowed_flag; + orch.set_exec_disabled(!cfg.exec_allowed); LoopManager loops; // Each pane's exec thread sets ::g_active_pane (file-scope thread_local) @@ -232,8 +234,18 @@ static void cmd_interactive() { // Layout calls this when splitting to materialise a new pane with every // callback wired to app-scope state. Defined before orch callbacks so // their captures of active_pane/layout_ptr are consistent. + // + // Declared here so make_pane can capture it by reference; the lambda + // body is assigned in the pump setup block below once the CV is live. + std::function pump_notify; + auto make_pane = [&]() -> std::unique_ptr { auto p = std::make_unique(); + // Wire pump wakeup so any output push wakes the drain thread + // immediately rather than waiting for the next 30ms tick. 
+ p->output_queue.set_notify_fn([&pump_notify](){ + if (pump_notify) pump_notify(); + }); p->current_agent = "index"; p->current_model = orch.get_agent_model(p->current_agent); p->tui.init(p->current_agent, p->current_model, @@ -415,6 +427,28 @@ static void cmd_interactive() { // clean start (no welcome), since splitting is an explicit user action. layout.focused().tui.draw_welcome(layout.focused().history); + // Exec-capability warning — list any agents that can run shell commands. + // Queued here so the pump thread renders it below the welcome card on its + // first tick. Omitted when exec is globally disabled (--no-exec). + if (cfg.exec_allowed) { + std::vector exec_agents; + for (const auto& id : orch.list_agents_all()) { + for (const auto& cap : orch.get_constitution(id).capabilities) { + if (cap == "exec") { exec_agents.push_back(id); break; } + } + } + if (!exec_agents.empty()) { + std::string names; + for (size_t i = 0; i < exec_agents.size(); ++i) { + if (i) names += ", "; + names += exec_agents[i]; + } + layout.focused().output_queue.push_msg( + "\033[2m[exec enabled: " + names + + " \xe2\x80\x94 shell commands will run as you]\033[0m"); + } + } + // Pump thread reads through g_getc_state.pane, which tracks the focused // pane for SIGWINCH repaints (output drain iterates all panes directly). g_getc_state.pane = &layout.focused(); @@ -1195,12 +1229,30 @@ static void cmd_interactive() { // different agents execute simultaneously (same-provider sends still // serialize at ApiClient's connection mutex, which is a network-layer // constraint rather than an app-level one). - start_pane_thread(layout.focused()); - // ── Output pump ──────────────────────────────────────────────────────── // Drains every pane's output_queue every tick and repaints its scroll // region. Holds layout_mu for the whole iteration so a concurrent // split/close/focus on the main thread can't mutate the tree mid-walk. 
+ // + // The pump wakes immediately when any pane's OutputQueue receives data + // (via the notify_fn_ callback) and falls back to a 30ms poll so + // SIGWINCH repaints and the stop signal are still serviced promptly. + std::mutex pump_cv_mu; + std::condition_variable pump_cv; + bool pump_notified = false; + + // Assign the notify function before starting any exec thread so the + // callback is fully visible to any thread that calls push(). + pump_notify = [&]() { + { std::lock_guard lk(pump_cv_mu); pump_notified = true; } + pump_cv.notify_one(); + }; + + // Start exec thread after pump_notify is assigned — the exec thread + // captures pump_notify by reference via OutputQueue::notify_fn_ and + // may call push() on its first tick. + start_pane_thread(layout.focused()); + std::atomic pump_stop{false}; std::thread output_pump([&]() { auto flush_pane = [&](Pane& p) { @@ -1216,6 +1268,13 @@ static void cmd_interactive() { p.new_while_scrolled); }; while (!pump_stop.load()) { + // Wait for data or the 30ms fallback tick (covers SIGWINCH and stop). + { + std::unique_lock wlk(pump_cv_mu); + pump_cv.wait_for(wlk, std::chrono::milliseconds(30), + [&]{ return pump_notified || pump_stop.load(); }); + pump_notified = false; + } std::unique_lock lk(layout_mu); if (g_winch) { g_winch = 0; @@ -1233,8 +1292,6 @@ static void cmd_interactive() { layout.focused().editor.interrupt(); } layout.for_each_pane(flush_pane); - lk.unlock(); - std::this_thread::sleep_for(std::chrono::milliseconds(30)); } // Final drain — no need to lock here; exec threads have been joined // by this point (shutdown ordering), so no other thread is mutating. 
@@ -1482,6 +1539,7 @@ static void cmd_interactive() { } pump_stop = true; + pump_cv.notify_one(); // unblock the pump's wait_for so it exits promptly output_pump.join(); if (stdin_is_tty) ::tcsetattr(STDIN_FILENO, TCSANOW, &orig_stdin_tm); @@ -1513,6 +1571,10 @@ int main(int argc, char* argv[]) { std::string arg1 = argv[1]; + if (arg1 == "--no-exec") { + cmd_interactive(false); + return 0; + } if (arg1 == "--init" || arg1 == "init") { // arbiter --init [--force] // Without --force, --init preserves existing agent JSON files @@ -1530,10 +1592,11 @@ int main(int argc, char* argv[]) { return 0; } if (arg1 == "--api" || arg1 == "api") { - // arbiter --api [--port N] [--bind ADDR] [--verbose] + // arbiter --api [--port N] [--bind ADDR] [--verbose] [--allow-host-exec] int port = 8080; std::string bind = "127.0.0.1"; bool verbose = false; + bool allow_host_exec = false; for (int i = 2; i < argc; ) { std::string k = argv[i]; if (k == "--verbose" || k == "-v") { @@ -1541,6 +1604,11 @@ int main(int argc, char* argv[]) { ++i; continue; } + if (k == "--allow-host-exec") { + allow_host_exec = true; + ++i; + continue; + } if (i + 1 >= argc) { std::cerr << "--api flag '" << k << "' requires a value\n"; return 1; @@ -1554,7 +1622,7 @@ int main(int argc, char* argv[]) { } i += 2; } - arbiter::cmd_api(port, bind, verbose); + arbiter::cmd_api(port, bind, verbose, allow_host_exec); return 0; } if (arg1 == "--send" || arg1 == "send") { @@ -1608,10 +1676,15 @@ int main(int argc, char* argv[]) { std::cout << "Usage:\n" " arbiter Interactive REPL\n" - " arbiter --api [--port N] [--bind ADDR] [--verbose]\n" + " arbiter --no-exec Interactive REPL with /exec disabled\n" + " (agents cannot run shell commands)\n" + " arbiter --api [--port N] [--bind ADDR] [--verbose] [--allow-host-exec]\n" " HTTP+SSE orchestration API (default 127.0.0.1:8080).\n" " --verbose mirrors every SSE event (text deltas, tool calls,\n" " thinking, etc.) to stderr. 
Env: ARBITER_API_VERBOSE=1.\n" + " --allow-host-exec permits agents to run shell commands on\n" + " the host via popen(). WARNING: agents run as this process's\n" + " user. Also: ARBITER_ALLOW_HOST_EXEC=1.\n" " arbiter --send One-shot message\n" " arbiter --init [--force] Initialize config + example agents\n" " --force overwrites existing ~/.arbiter/agents/*.json files;\n" diff --git a/src/orchestrator.cpp b/src/orchestrator.cpp index 53b6c6d..e6a7edb 100644 --- a/src/orchestrator.cpp +++ b/src/orchestrator.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -93,7 +94,8 @@ std::string pick_master_model_default( } // namespace Orchestrator::Orchestrator(std::map api_keys) - : client_(api_keys) // copy — the map is tiny, we still need it below + : client_(api_keys), // copy into client_ — keys are tiny + api_keys_(api_keys) // second copy for per-child clients in /parallel { // Default memory directory is cwd-scoped ($PWD/.arbiter/memory) memory_dir_ = (fs::current_path() / ".arbiter" / "memory").string(); @@ -171,6 +173,29 @@ void Orchestrator::load_agents(const std::string& dir) { } } +std::string route_event(const std::string& agents_dir, + const std::string& event_type) { + if (!fs::is_directory(agents_dir)) return "index"; + for (auto& entry : fs::directory_iterator(agents_dir)) { + if (entry.path().extension() != ".json") continue; + try { + auto config = Constitution::from_file(entry.path().string()); + for (const auto& pattern : config.event_types) { + if (fnmatch(pattern.c_str(), event_type.c_str(), 0) == 0) { + std::string id = config.name.empty() + ? entry.path().stem().string() + : config.name; + return id; + } + } + } catch (const std::exception& e) { + fprintf(stderr, "WARN: route_event skip %s: %s\n", + entry.path().c_str(), e.what()); + } + } + return "index"; +} + // Build an AgentInvoker that runs a sub-agent through the full dispatch loop. 
AgentInvoker Orchestrator::make_invoker(const std::string& caller_id, int depth, std::map* shared_cache, @@ -298,6 +323,22 @@ ParallelInvoker Orchestrator::make_parallel_invoker(const std::string& caller_id // a fan-out, not a chained continuation. // • The shared dedup cache is intentionally not propagated // between siblings — see make_invoker's note. + // Give each child its own ApiClient so their LLM calls run concurrently + // instead of serializing on the parent's conn_mutex_. Children are + // wired with the same circuit breaker and metrics as the parent. + std::vector> child_clients; + child_clients.reserve(kids.size()); + for (size_t i = 0; i < kids.size(); ++i) { + child_clients.push_back(std::make_unique(api_keys_)); + child_clients.back()->set_circuit_breaker(client_.circuit_breaker()); + child_clients.back()->set_metrics(client_.metrics()); + } + // Register child clients so cancel() can reach them while threads run. + { + std::lock_guard lk(parallel_clients_mu_); + for (auto& c : child_clients) parallel_clients_.push_back(c.get()); + } + std::vector threads; std::vector results(kids.size()); threads.reserve(kids.size()); @@ -306,7 +347,7 @@ ParallelInvoker Orchestrator::make_parallel_invoker(const std::string& caller_id const std::string sub_id = kids[i].first; const std::string sub_msg = kids[i].second; threads.emplace_back([this, i, sub_id, sub_msg, caller_id, depth, - original_query, &results]() { + original_query, &results, &child_clients]() { // Basic validations — mirror make_invoker's gates. if (sub_id == caller_id) { results[i] = "ERR: agent cannot invoke itself"; @@ -393,9 +434,9 @@ ParallelInvoker Orchestrator::make_parallel_invoker(const std::string& caller_id } // Fresh ephemeral Agent for this child — independent - // history_, independent stats_, no race with siblings or - // the canonical agent registered in agents_. 
- Agent ephemeral(sub_id, std::move(cfg_copy), client_); + // history_, independent stats_, independent ApiClient + // (its own connection pool so LLM calls run concurrently). + Agent ephemeral(sub_id, std::move(cfg_copy), *child_clients[i]); if (compact_cb_) ephemeral.set_compact_callback(compact_cb_); std::map local_cache; @@ -411,6 +452,18 @@ ParallelInvoker Orchestrator::make_parallel_invoker(const std::string& caller_id } for (auto& t : threads) t.join(); + + // Unregister child clients — they're about to be destroyed. + { + std::lock_guard lk(parallel_clients_mu_); + for (auto& c : child_clients) { + auto it = std::find(parallel_clients_.begin(), + parallel_clients_.end(), c.get()); + if (it != parallel_clients_.end()) + parallel_clients_.erase(it); + } + } + return results; }; } @@ -1768,11 +1821,15 @@ static std::vector messages_from_json(const JsonValue* arr) { } void Orchestrator::cancel() { - // All agents and the master share the same ApiClient instance. - // One cancel() call interrupts any in-progress streaming across the board. client_.cancel(); + // Also cancel any per-child clients active inside a /parallel turn. + std::lock_guard lk(parallel_clients_mu_); + for (ApiClient* c : parallel_clients_) c->cancel(); } +static constexpr size_t kSessionWarnBytes = 4 * 1024 * 1024; // 4 MB total +static constexpr size_t kAgentWarnBytes = 512 * 1024; // per-agent + void Orchestrator::save_session(const std::string& path) const { auto root = jobj(); auto& m = root->as_object_mut(); @@ -1792,8 +1849,35 @@ void Orchestrator::save_session(const std::string& path) const { } m["agents"] = agents_obj; + std::string serialized = json_serialize(*root); + + if (serialized.size() > kSessionWarnBytes) { + // Log which agents contributed large histories so the user knows + // which to /compact when the file grows unwieldy. Includes the + // index master since it can independently be the source of bloat. 
+ std::string over_limit; + auto check_agent = [&](const std::string& id, const Agent& a) { + std::string blob = json_serialize(*messages_to_json(a.history())); + if (blob.size() > kAgentWarnBytes) { + if (!over_limit.empty()) over_limit += ", "; + over_limit += id + " (" + std::to_string(blob.size() / 1024) + " KB)"; + } + }; + { + check_agent("index", *index_master_); + std::lock_guard lk(agents_mutex_); + for (auto& [id, agent] : agents_) check_agent(id, *agent); + } + ::fprintf(stderr, + "WARN: session file is %.1f MB (limit %.0f MB)%s\n" + " Run /compact [agent] to trim histories and keep startup fast.\n", + serialized.size() / (1024.0 * 1024.0), + kSessionWarnBytes / (1024.0 * 1024.0), + over_limit.empty() ? "" : (" — large agents: " + over_limit).c_str()); + } + std::ofstream f(path); - if (f.is_open()) f << json_serialize(*root); + if (f.is_open()) f << serialized; } bool Orchestrator::load_session(const std::string& path) { @@ -1805,6 +1889,13 @@ bool Orchestrator::load_session(const std::string& path) { std::string raw = ss.str(); if (raw.empty()) return false; + if (raw.size() > kSessionWarnBytes) { + ::fprintf(stderr, + "WARN: session file is %.1f MB — startup may be slow. " + "Run /compact [agent] to trim histories.\n", + raw.size() / (1024.0 * 1024.0)); + } + try { auto root = json_parse(raw); bool any_restored = false; diff --git a/src/repl/queues.cpp b/src/repl/queues.cpp index afcaaeb..beba232 100644 --- a/src/repl/queues.cpp +++ b/src/repl/queues.cpp @@ -47,22 +47,32 @@ void CommandQueue::drain() { // ─── OutputQueue ───────────────────────────────────────────────────────────── -void OutputQueue::push(const std::string& s) { +void OutputQueue::set_notify_fn(std::function fn) { std::lock_guard lk(mu_); - if (s.empty()) return; - if (need_sep_) { - // Materialise the pending separator as exactly `\n\n`. 
Strip any - // trailing newlines from the buffer first so multi-line content - // (markdown-rendered lines that end with `\n`) gets one blank line - // between messages, not two or more. If the buffer was drained in - // between, buf_ is empty and we emit a leading `\n\n` — the prior - // drain's content already ends with the trailing-content newline, - // so `A` + next drain `\n\nB` renders as `A` + blank + `B`. - while (!buf_.empty() && buf_.back() == '\n') buf_.pop_back(); - buf_ += "\n\n"; - need_sep_ = false; + notify_fn_ = std::move(fn); +} + +void OutputQueue::push(const std::string& s) { + std::function fn; + { + std::lock_guard lk(mu_); + if (s.empty()) return; + if (need_sep_) { + // Materialise the pending separator as exactly `\n\n`. Strip any + // trailing newlines from the buffer first so multi-line content + // (markdown-rendered lines that end with `\n`) gets one blank line + // between messages, not two or more. If the buffer was drained in + // between, buf_ is empty and we emit a leading `\n\n` — the prior + // drain's content already ends with the trailing-content newline, + // so `A` + next drain `\n\nB` renders as `A` + blank + `B`. 
+ while (!buf_.empty() && buf_.back() == '\n') buf_.pop_back(); + buf_ += "\n\n"; + need_sep_ = false; + } + buf_ += s; + fn = notify_fn_; // copy under lock; invoke outside to avoid inversion } - buf_ += s; + if (fn) fn(); } void OutputQueue::end_message() { diff --git a/tests/test_commands.cpp b/tests/test_commands.cpp index d5d1db4..a5c93b2 100644 --- a/tests/test_commands.cpp +++ b/tests/test_commands.cpp @@ -189,6 +189,61 @@ TEST_CASE("parse_agent_commands skips code fences") { CHECK(cmds[0].args == "ls"); } +TEST_CASE("parse_agent_commands fence: tilde fence skips commands inside") { + std::string response = "~~~\n/exec rm -rf /\n~~~\n/exec echo ok\n"; + auto cmds = parse_agent_commands(response); + REQUIRE(cmds.size() == 1); + CHECK(cmds[0].name == "exec"); + CHECK(cmds[0].args == "echo ok"); +} + +TEST_CASE("parse_agent_commands fence: mismatched open/close does not escape block") { + // Opened with ```, attempted close with ~~~ — block must stay open. + // The /exec after the mismatch is still inside the fence. + // Only a matching ``` can close it. + std::string response = "```\n/exec danger\n~~~\n/exec still_inside\n```\n/exec outside\n"; + auto cmds = parse_agent_commands(response); + REQUIRE(cmds.size() == 1); + CHECK(cmds[0].name == "exec"); + CHECK(cmds[0].args == "outside"); +} + +TEST_CASE("parse_agent_commands fence: info string on opening line") { + // ```python or ```cpp should open a fence just like a bare ``` + std::string response = "```python\n/exec echo inside\n```\n/exec echo outside\n"; + auto cmds = parse_agent_commands(response); + REQUIRE(cmds.size() == 1); + CHECK(cmds[0].name == "exec"); + CHECK(cmds[0].args == "echo outside"); +} + +TEST_CASE("parse_agent_commands fence: unclosed fence suppresses trailing commands") { + // An unclosed ``` means everything after is inside the block. 
+ std::string response = "```\n/exec danger\n/exec also_danger\n"; + auto cmds = parse_agent_commands(response); + CHECK(cmds.empty()); +} + +TEST_CASE("parse_agent_commands fence: nested open attempt is ignored") { + // Once inside a ``` block, a second ``` line is not a new fence — it + // is treated as a close of the outer block. + std::string response = "```\n```\n/exec this_is_outside\n"; + auto cmds = parse_agent_commands(response); + REQUIRE(cmds.size() == 1); + CHECK(cmds[0].args == "this_is_outside"); +} + +TEST_CASE("parse_agent_commands fence: empty file is safe") { + auto cmds = parse_agent_commands(""); + CHECK(cmds.empty()); +} + +TEST_CASE("parse_agent_commands fence: only fence markers, no commands") { + std::string response = "```\n```\n"; + auto cmds = parse_agent_commands(response); + CHECK(cmds.empty()); +} + TEST_CASE("parse_agent_commands recognises /help with and without topic") { { auto cmds = parse_agent_commands("/help\n"); @@ -400,6 +455,43 @@ TEST_CASE("/mem add entry dispatcher rejects empty body and unclosed block") { } } +TEST_CASE("/parallel duplicate agent_id produces explicit error") { + // When the same agent_id appears more than once in a /parallel block, + // execute_agent_commands must reject it with the exact error string + // defined in commands.cpp before calling the parallel_invoker. + + // Build a /parallel command whose body lists the same agent twice. + AgentCommand cmd; + cmd.name = "parallel"; + cmd.args = ""; + cmd.content = "/agent researcher find patterns\n/agent researcher find more\n"; + cmd.truncated = false; + + std::vector cmds; + cmds.push_back(cmd); + + // The parallel_invoker must NOT be called — set it to one that panics + // so we detect any accidental call. 
+ bool invoker_called = false; + auto guard_invoker = [&](const std::vector>&) + -> std::vector { + invoker_called = true; + return {}; + }; + + auto out = execute_agent_commands( + cmds, "index", "", + /*agent_invoker=*/nullptr, /*confirm=*/nullptr, + /*dedup_cache=*/nullptr, /*advisor_invoker=*/nullptr, + /*tool_status=*/nullptr, /*pane_spawner=*/nullptr, + /*write_interceptor=*/nullptr, /*exec_disabled=*/false, + /*parallel_invoker=*/guard_invoker); + + CHECK(!invoker_called); + CHECK(out.find("ERR: /parallel cannot invoke 'researcher' twice in one batch") != std::string::npos); + CHECK(out.find("each agent_id must appear at most once") != std::string::npos); +} + TEST_CASE("/help dispatch returns topic body or ERR for unknown topic") { // The /help dispatch path needs no callbacks — it reads from the // help corpus baked into commands.cpp. Smoke-test the three shapes: From 0cc4f90d4b9234b5d9ffda4e3baba6b4cb11bddc Mon Sep 17 00:00:00 2001 From: Tyler Reckart Date: Tue, 23 Jun 2026 10:32:15 -0400 Subject: [PATCH 3/4] work on docs --- docs/api/events.md | 89 ++++ docs/api/index.md | 3 +- examples/3bo/BOM.md | 136 +---- examples/3bo/JETSON.md | 96 ++-- examples/3bo/README.md | 1 + examples/3bo/VISION.md | 501 ++---------------- examples/3bo/agents/3bo-events.json | 18 + examples/3bo/agents/3bo-voice.json | 16 + examples/3bo/agents/threebo-monitor.json | 26 + .../threebo_config.example.h | 15 +- .../threebo_nano_esp32/threebo_nano_esp32.ino | 51 +- examples/voice-bridge/bridge.py | 128 ++++- 12 files changed, 446 insertions(+), 634 deletions(-) create mode 100644 docs/api/events.md create mode 100644 examples/3bo/agents/3bo-events.json create mode 100644 examples/3bo/agents/3bo-voice.json create mode 100644 examples/3bo/agents/threebo-monitor.json diff --git a/docs/api/events.md b/docs/api/events.md new file mode 100644 index 0000000..64cd20f --- /dev/null +++ b/docs/api/events.md @@ -0,0 +1,89 @@ +# `POST /v1/events` + +**Auth:** tenant — _Status:_ experimental + 
+Turns a structured hardware or software event into a full Arbiter run. The runtime routes the event to an agent, supplies that agent with its normal memory and tools, and streams the resulting reasoning and actions as Server-Sent Events. + +Use this endpoint for application webhooks, infrastructure alerts, sensor readings, edge-device signals, robotics bridges, and other systems that produce events rather than conversational prompts. + +## Request + +### Body + +| Field | Type | Required | Description | +|---|---|---|---| +| `type` | string | yes | Event type used for agent routing, such as `sensor.temperature.threshold` or `deployment.failed`. | +| `source` | string | no | Human-readable source identifier, such as `edge/rack-04` or `github/acme/api`. | +| `payload` | any JSON value | no | Event-specific data supplied to the selected agent. | +| `agent` | string | no | Explicit agent id. When present, bypasses type-based routing. | + +### Headers + +| Header | Required | Purpose | +|---|---|---| +| `Authorization` | yes | `Bearer <token>`. See [authentication](../concepts/authentication.md). | +| `Content-Type` | yes | `application/json`. | + +```bash +curl -N http://127.0.0.1:8080/v1/events \ + -H "Authorization: Bearer $ARBITER_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "type": "sensor.temperature.threshold", + "source": "edge/rack-04", + "payload": { "celsius": 84.6 } + }' +``` + +## Routing + +File-backed agents opt into events with `event_types` in their constitution. Each entry is a glob matched against the event type. Arbiter scans the JSON definitions in its configured agents directory; tenant agents created through `POST /v1/agents` are not part of automatic event routing in this experimental version. If no file-backed agent matches, Arbiter routes the event to `index`. 
+ +```json +{ + "name": "facilities", + "model": "ollama/qwen3.6", + "event_types": [ + "sensor.*", + "facility.alert.*" + ], + "capabilities": ["exec"] +} +``` + +An explicit `agent` in the request body takes precedence over `event_types` routing. + +Keep routing patterns distinct. If multiple agents match an event in this experimental implementation, the first matching agent definition is selected. + +## What the agent receives + +Arbiter presents the event to the selected agent as a normal turn: + +```text +Event: sensor.temperature.threshold +Source: edge/rack-04 +Payload: {"celsius":84.6} +``` + +The selected agent can use the same memory, delegation, MCP, artifact, search, and permitted execution capabilities available to a direct orchestration request. + +Event payloads are input data, but they are visible to the model. Treat event sources as untrusted, grant each routed agent only the capabilities it requires, and leave host execution disabled for externally sourced events. Prefer the tenant sandbox when an agent must execute commands. + +## Response + +The response is `text/event-stream` and follows the same lifecycle as [`POST /v1/orchestrate`](orchestrate.md): `request_received`, agent and tool activity, advisor decisions, and a terminal `done` event. + +## Failure modes + +| Status | When | Body | +|---|---|---| +| 400 | Body is not a JSON object, JSON is invalid, or `type` is missing. | `{"error":"..."}` | +| 401 | Bearer token is missing or invalid, or the tenant is disabled. | `{"error":"..."}` | +| 200 + `done.ok = false` | The routed run fails after the SSE stream opens. | SSE `error` followed by `done`. | + +## See also + +- [`POST /v1/orchestrate`](orchestrate.md) — direct request ingestion. +- [Agent data model](../concepts/data-model.md) — agent constitution fields. +- [SSE event catalog](../concepts/sse-events.md) — streamed event shapes. +- [Authentication](../concepts/authentication.md) — tenant bearer tokens. 
diff --git a/docs/api/index.md b/docs/api/index.md index 43010f9..21e8fd5 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -1,6 +1,6 @@ # Arbiter HTTP API -Arbiter exposes its multi-agent orchestrator as an HTTP + Server-Sent Events API. One `POST /v1/orchestrate` drives the full agentic loop — master agent turns, delegated and parallel sub-agent calls, tool invocations, generated files — and streams the whole thing back as SSE events. +Arbiter exposes its reasoning runtime as an HTTP + Server-Sent Events API. Send a direct request with `POST /v1/orchestrate`, or ingest a structured hardware or software event with `POST /v1/events`. Both drive the full agentic loop — routing, durable context, delegated and parallel sub-agent calls, tool invocations, and generated files — and stream the whole execution back as SSE events. Billing — eligibility checks, rate cards, caps, invoicing — is delegated to an external billing service when `ARBITER_BILLING_URL` is set. The runtime exchanges every bearer for a workspace_id via `POST /v1/runtime/auth/validate`, pre-flights against `POST /v1/runtime/quota/check`, and posts post-turn telemetry to `POST /v1/runtime/usage/record`. Operators wanting a commercial deployment must implement that protocol against a service of their choosing — arbiter ships no reference implementation under this repository. With the env var unset, the runtime acts as a thin pass-through using the operator-supplied provider keys, with no eligibility checks. 
@@ -34,6 +34,7 @@ Each endpoint page below uses the same template: **Function**, **Request**, **Re - [`GET /v1/health`](health.md) - [`GET /v1/metrics`](metrics.md) - [`GET /v1/models`](models.md) +- [`POST /v1/events`](events.md) - [`POST /v1/orchestrate`](orchestrate.md) - [`POST /v1/requests/:id/cancel`](requests-cancel.md) - [`GET /v1/requests`](requests/list.md) diff --git a/examples/3bo/BOM.md b/examples/3bo/BOM.md index 6885657..7b37953 100644 --- a/examples/3bo/BOM.md +++ b/examples/3bo/BOM.md @@ -147,36 +147,21 @@ For the first portable-power add-on: ## Power budget notes -Treat the Jetson as a separate high-current compute load. Use the -vendor-recommended Jetson supply during bench bring-up. For an enclosed -single-input robot, add a dedicated regulator sized for the exact Jetson carrier -input and power budget. - -The Nano ESP32 should be powered from the Jetson over the same USB-C cable used -for the serial link. Do not connect the battery pack or a battery-derived buck -regulator to Nano `VIN` in the wired Jetson build. This keeps the battery -system concerned only with the Jetson input rail and avoids USB/VIN backfeed -questions. - -The 5 V LED/audio rail must be treated as a measured USB-powered load. For the -quiet first prototype, keep NeoPixel brightness and speaker volume low and -verify the Jetson USB port, Nano board, and wiring stay within their safe -current limits. If the MAX98357A and NeoPixel stick need more current than the -USB/Nano path can provide, add a Jetson-powered USB hub or current-limited 5 V -accessory rail; do not add a separate battery branch just for the Nano body. - -A 4S 5000 mAh pack is about 74 Wh before conversion losses and reserve: - -```text -14.8 V nominal * 5 Ah = 74 Wh -``` - -Budget a real-world usable fraction rather than draining the pack flat. 
A -Jetson-heavy STT/TTS workload can pull the robot into the 25-35 W range, so a -5000 mAh 4S pack is a roughly 1.5-2.5 hour prototype battery, not an all-day -power source. Portable battery mode requires a hard low-voltage cutoff or a -protected 4S pack/BMS. A balance-plug buzzer is useful while you are nearby on -the bench, but it is not a product safety mechanism. +Power the Nano ESP32 from the Jetson USB port only — don't connect `VIN` while +USB is live. Keep NeoPixel brightness and speaker volume low until you've +measured the body rail current. A 4S 5000 mAh pack at 25–35 W Jetson load is +roughly 1.5–2.5 hours; use a low-voltage cutoff or protected BMS for unattended +use. + +Planning current by load: + +| Load | Planning current | +|---|---| +| Jetson Orin Nano Super | 7–25 W depending on power mode | +| Nano ESP32 over USB | 150–300 mA bursts | +| MAX98357A speaker path | 50–600 mA (keep low on USB power) | +| NeoPixel Stick 8 RGBW | 100–500 mA (cap brightness aggressively) | +| ICS-43434 microphone | < 1 mA | Before connecting the Jetson to a battery regulator, run these acceptance tests: @@ -189,97 +174,6 @@ Before connecting the Jetson to a battery regulator, run these acceptance tests: | Polarity check | Center-positive barrel wiring confirmed with a meter at the plug. | | Branch protection | Jetson branch has its own fuse or protected distribution path. | -The Adafruit product 184 kit is convenient and breadboard-friendly, but it is -based on a low-dropout linear regulator rather than a switching buck converter. -It is not part of the preferred runtime power path. Use it for bench testing or -a low-current isolated peripheral experiment, not in parallel with the Jetson -USB-powered Nano/body rail. 
- -A conservative first power target: - -| Load | Planning current | -| --------------------------- | --------------------------------------------------------------------- | -| Jetson Orin Nano Super | 7-25 W depending on power mode and workload | -| Nano ESP32 over USB serial | 150-300 mA bursts | -| MAX98357A speaker path | 50-600 mA depending on volume and speaker; keep low on USB power | -| NeoPixel Stick, 8 RGBW LEDs | 100-500 mA depending on brightness/color; cap brightness aggressively | -| ICS-43434 I2S microphone | under 1 mA | - -Cap LED brightness and speaker volume in firmware. A rectangular PP3 9 V -battery is no longer part of the default plan. - -The ICS-43434 microphone needs 3.3 V. The simplest first build can use the Nano -ESP32 `3V3` pin for that low-current microphone rail after the Nano is powered -from Jetson USB. If you use product 184 or another regulator as a separate -3.3 V rail, use it for peripherals only; do not backfeed the Nano `3V3` pin -unless the board documentation explicitly allows it. Route the microphone's -3.3 V through the hard-mute switch or load switch so the mic is physically -unpowered while muted. - -## LED indicator choice - -The selected 3bo LED indicator is the Adafruit NeoPixel Stick with 8 x 5050 -RGBW LEDs in cool white, product 2869. It provides enough pixels for simple -states like idle breathing, wake flash, listening pulse, thinking sweep, and -speaking meter without the current draw of a larger ring or matrix. - -Power the stick from the regulated 5 V rail, tie grounds to the Nano ESP32, and -use a small series resistor on the data line. Product 2869 is RGBW, so firmware -must use a NeoPixel library/configuration that understands four channels per -pixel. - -## Microphone choice - -The selected 3bo speech microphone is the Adafruit ICS-43434 I2S breakout -because it sends digital audio directly over I2S and fits the ESP32-S3 audio -path well. 
- -The Adafruit MAX9814 electret microphone amplifier, product 1713, is the better -optional analog mic if you want an ADC signal for quick sound-level tests or -sound-reactive LED experiments, because its automatic gain control handles -changing volume better. - -The Adafruit MAX4466 electret microphone amplifier, product 1063, is another -optional analog mic with manually adjustable gain. Either analog module can help -compare analog and digital microphone behavior, but neither should replace the -I2S microphone for the main wake-word pipeline unless the firmware is redesigned -around analog sampling. - -## Speaker amplifier choice - -The selected 3bo speaker amplifier is the Adafruit MAX98357A I2S 3 W class-D -breakout, product 3006. It combines the I2S DAC and mono amplifier stage, so the -Nano ESP32 can stream digital audio to it directly. With a 5 V supply, the -breakout is rated up to 3.2 W into 4 ohm or 1.8 W into 8 ohm at 10% THD. - -For the first build, an 8 ohm speaker is the safer default because it draws less -current and gives the breadboard supply more margin. The Adafruit -breadboard-friendly 8 ohm 0.2 W mini speaker, product 1898, is useful for quiet -bring-up because it plugs into a breadboard or perfboard, but it must be kept at -low volume. Consider a temporary 47-100 ohm series resistor for initial tone -tests. A later 8 ohm 1-3 W speaker will sound better for actual spoken -responses. A 4 ohm speaker can be louder, but it makes the 5 V rail current -budget more important. - -## Jetson brain choice - -The selected local brain is the NVIDIA Jetson Orin Nano Super Developer Kit. -It is overkill for the first LED-and-speaker prototype, in a useful way: it can -run Arbiter locally while also hosting STT/TTS and later vision or richer robot -behaviors. - -Recommended first Jetson stack: - -- Ubuntu/JetPack on the Jetson. -- `arbiter --api` bound to `127.0.0.1:8080`. 
-- A 3bo bridge service bound to the LAN, for example `0.0.0.0:8081`, with - per-device shared-secret authentication and request rate limits. -- `whisper.cpp` or another local STT runtime using small models first. -- Local TTS such as Piper first, cloud TTS as an optional quality upgrade. - -Keep model files, provider keys, Arbiter tenant tokens, and conversation logs -on the Jetson, not on the Nano ESP32. - ## Reference links - Arduino Nano ESP32 product/spec page: https://store.arduino.cc/products/nano-esp32 diff --git a/examples/3bo/JETSON.md b/examples/3bo/JETSON.md index d221d9c..95cb680 100644 --- a/examples/3bo/JETSON.md +++ b/examples/3bo/JETSON.md @@ -375,20 +375,21 @@ THREEBO_CONVERSATION_FILE=/etc/3bo/conversation.json # Agent routing (defaults match what we create below) # THREEBO_LOCAL_AGENT=local # THREEBO_CLOUD_AGENT=index + +# Hardware event forwarding — bridge relays firmware state events to Arbiter. +# Set to 0 to disable if you don't register the threebo-monitor agent. +# BRIDGE_EVENTS_ENABLED=1 EOF sudo chmod 600 /etc/3bo/bridge.env ``` -### Register the local Arbiter agent +### Register the Arbiter agents -The `local` agent sends simple queries to Ollama instead of the cloud. Create -it once with the admin API. Arbiter must be running: +Arbiter must be running for these commands. -```sh -# Load the admin token -ARBITER_ADMIN_TOKEN=$(cat ~/.arbiter/admin_token) +**Local fast-path agent** — sends simple queries to Ollama instead of the cloud: -# Create the local fast-path agent (stored in ~/.arbiter/tenants.db) +```sh curl -s -X POST http://127.0.0.1:8080/v1/agents \ -H "Authorization: Bearer atr_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" \ -H "Content-Type: application/json" \ @@ -406,6 +407,22 @@ The `index` agent (cloud) already exists as the default orchestrator. No additional registration is needed unless you want to override its model or goal. 
+**Hardware event monitor** — receives device state events from the firmware +via `POST /v1/events` and handles them with a local Ollama model. Matched by +the `event_types` glob patterns in the agent JSON +(`examples/3bo/agents/threebo-monitor.json`): + +```sh +curl -s -X POST http://127.0.0.1:8080/v1/agents \ + -H "Authorization: Bearer atr_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" \ + -H "Content-Type: application/json" \ + -d @/path/to/arbiter/examples/3bo/agents/threebo-monitor.json +``` + +The monitor agent handles `device.*` and `audio.*` event types locally (no +cloud call) and logs errors to the agent's `/mem` store. The bridge must have +`BRIDGE_EVENTS_ENABLED=1` (the default) for event forwarding to be active. + ### Run ```sh @@ -450,6 +467,16 @@ curl -s -o /dev/null -w "%{http_code}" \ # 401 ``` +**Hardware event endpoint:** + +```sh +curl -s -X POST http://localhost:8081/v1/event \ + -H "Authorization: Bearer $THREEBO_DEVICE_SECRET" \ + -H "Content-Type: application/json" \ + -d '{"type":"device.mute","data":{}}' +# {"ok":true} +``` + --- ## 7. Systemd services @@ -625,11 +652,13 @@ Run these steps in order. Each depends on the previous. - [ ] Write `/etc/3bo/bridge.env` with all required variables. - [ ] Register the `local` Arbiter agent via `curl POST /v1/agents`. +- [ ] Register the `threebo-monitor` event agent via `curl POST /v1/agents` with `examples/3bo/agents/threebo-monitor.json`. - [ ] Start the bridge: `python3 bridge.py --host 0.0.0.0 --port 8081`. - [ ] Confirm `GET /health` returns `ok`. - [ ] Confirm unauthenticated `POST /v1/utterance` returns `401`. - [ ] Run authenticated `/v1/transcribe` test and confirm transcript. - [ ] Run authenticated `/v1/utterance` test and play back the WAV. +- [ ] Send a test hardware event: `curl -s -X POST .../v1/event -d '{"type":"device.mute","data":{}}'` and confirm `{"ok":true}`. ### Step 7 — hardware loop @@ -651,63 +680,16 @@ Run these steps in order. Each depends on the previous. 
> Not needed for the v1 voice prototype. Complete Milestones 1–8 first. > Full design specification: [VISION.md](VISION.md). +> Wiring: [CIRCUIT.md](CIRCUIT.md) — PCA9685 and camera sections. +> Parts: [BOM.md](BOM.md) — Vision and motion subsystem table. -### Additional hardware - -The OV5640 camera lives in the robot head and connects to the Arduino Nano -ESP32, not directly to the Jetson. The ESP32-S3 captures frames, JPEG-compresses -them, and forwards them to the Jetson over the existing USB serial link. No -camera hardware connection to the Jetson is required. - -Wire the PCA9685 servo driver to the Jetson 40-pin header: - -| Jetson pin | PCA9685 | -| --- | --- | -| Pin 1 (3.3V) | VCC | -| Pin 2 or 4 (5V) | V+ (servo power) | -| Pin 3 (SDA) | SDA | -| Pin 5 (SCL) | SCL | -| Pin 6 or 9 (GND) | GND | - -Confirm I2C: `sudo i2cdetect -y 1` should show the PCA9685 at address `0x40`. - -Confirm camera: `ls /dev/video*` after connecting, or run -`nvgstcapture-1.0` for a live preview. - -### Additional Python dependencies +Additional dependencies when Milestone 5 begins: ```sh pip3 install mediapipe opencv-python smbus2 pyserial -``` - -`pyserial` is required for the vision service to read JPEG frames from the -ESP32-S3 over the USB serial port. - -### Additional Ollama model - -```sh ollama pull moondream ``` -moondream (~1.6B) handles visual queries ("what do you see?"). It runs -alongside `gemma3:4b` within the Jetson's 8 GB. - -### Bring-up checklist — Milestone 5 - -- [ ] Confirm `i2cdetect -y 1` shows PCA9685 at 0x40. -- [ ] Flash ESP32-S3 firmware with OV5640 camera capture and `frame` USB serial message support. -- [ ] Confirm Jetson sees JPEG frames arriving on the USB serial port from the ESP32-S3. -- [ ] Install Python deps: `mediapipe opencv-python smbus2 pyserial`. -- [ ] Pull moondream: `ollama pull moondream`. -- [ ] Run `vision_service.py`; confirm `GET /health` responds. -- [ ] Confirm `GET /face` returns a centroid when a face is in frame. 
-- [ ] Confirm `POST /track {"enabled": true}` moves servos toward face. -- [ ] Confirm `POST /rest` returns head to (0°, −5°). -- [ ] Run a visual query through the bridge: ask "what do you see?" and - confirm the response references the scene. -- [ ] Install `3bo-vision.service` unit and enable it alongside the other - services. - --- ## Source references diff --git a/examples/3bo/README.md b/examples/3bo/README.md index ce40304..e3e2a3f 100644 --- a/examples/3bo/README.md +++ b/examples/3bo/README.md @@ -21,6 +21,7 @@ for the M5 upgrade path. See [VISION.md](VISION.md) for the full design. | `FIRMWARE.md` | Arduino firmware architecture, state machine, and bridge protocol | | `JETSON.md` | Step-by-step Jetson setup: Arbiter, Ollama, whisper.cpp, Piper, bridge | | `VISION.md` | M5 design spec: USB camera, pan/tilt neck, face tracking, VLM queries | +| `agents/` | Example Arbiter agent constitutions (event monitor) | | `bridge/` | 3bo bridge launcher and hardware bring-up stub | | `firmware/arduino/` | Arduino bench firmware sketch | diff --git a/examples/3bo/VISION.md b/examples/3bo/VISION.md index 5e1f93b..cc43ab4 100644 --- a/examples/3bo/VISION.md +++ b/examples/3bo/VISION.md @@ -1,488 +1,101 @@ # 3bo Vision and Head-Tracking System -> **MILESTONE 5 — FUTURE DESIGN ONLY** +> **MILESTONE 5 — FUTURE DESIGN** > Nothing in this document is implemented in v1. The v1 prototype has no camera, -> no servos, and no PCA9685. This document is a design specification for a -> future milestone. Do not treat any section as actionable guidance until -> Milestone 5 begins. +> no servos, and no PCA9685. Build servo pockets into the v1 neck bracket so +> Milestone 5 hardware can be installed without a mechanical rebuild. --- ## Overview -This document describes the planned camera and head-tracking system for 3bo. - -3bo detects a human face on wake and orients toward the user throughout the conversation. 
The neck uses a three-arm differential mechanism: a passive ball-joint pivot at the rear of the head and two servo-driven push/pull rods at the front. Both servos together control pitch (nod); differential servo motion controls roll (head cock). All horizontal tracking is handled by the motorised base. The head camera enables visual queries — the user can ask what 3bo sees and the response draws on a live frame passed through a local VLM. - -The system adds a USB webcam on the head tier, a PCA9685 I2C PWM driver on the Jetson, two MG90S micro-servos driving push/pull rods from the neck base to the head, an N20 gearmotor with encoder for base rotation, and a slip ring at the base joint for continuous 360° rotation. -A background vision service runs on the Jetson and exposes a small localhost -HTTP API consumed by the bridge. +3bo detects a human face on wake and orients toward the user throughout the +conversation. All horizontal tracking is handled by the motorised base. Vertical +tracking uses a three-arm differential neck: one passive rear ball-joint and two +servo-driven push/pull rods at the front. The head camera also enables visual +queries — the user can ask what 3bo sees and the response draws on a live frame +passed through a local VLM (moondream2 via Ollama). --- -## Hardware (Planned) - -### Component Table +## Additional Hardware | Component | Part | Notes | |---|---|---| -| Camera | Adafruit OV5640 Camera Breakout — 72° Lens with Autofocus, product 5945 | Mounted in robot head on custom carrier board. 8-bit parallel DVP to ESP32-S3 camera peripheral. ESP32-S3 JPEG-compresses frames and forwards them to Jetson over USB serial. | -| PWM driver | PCA9685 16-channel I2C servo driver | I2C address 0x40; connected to Jetson 40-pin header I2C bus | -| Servo L | MG90S metal-gear micro servo | PCA9685 channel 0. Left push/pull rod. | -| Servo R | MG90S metal-gear micro servo | PCA9685 channel 1. Right push/pull rod. 
| -| Base motor | N20 gearmotor with quadrature encoder, 6 V, 100–200 RPM | DRV8833 H-bridge driver; IN1/IN2 from PCA9685 channels 2/3. Encoder A/B to Jetson GPIO. | -| Base bearing | Lazy Susan ball bearing, 100–150 mm | Supports full body weight through 360° rotation | +| Camera | Adafruit OV5640 Camera Breakout — 72° Lens with Autofocus, product 5945 | Mounted in robot head. ESP32-S3 JPEG-compresses frames and forwards to Jetson over USB serial. | +| PWM driver | Adafruit PCA9685 16-channel servo driver, product 815 | I2C 0x40; connected to Jetson 40-pin header | +| Servo L | MG90S metal-gear micro servo | PCA9685 ch 0. Left push/pull rod. | +| Servo R | MG90S metal-gear micro servo | PCA9685 ch 1. Right push/pull rod. | +| Base motor | N20 6 V gearmotor with quadrature encoder, 100–200 RPM | DRV8833 H-bridge; IN1/IN2 from PCA9685 ch 2/3 | +| Base bearing | Lazy Susan bearing, 100–150 mm | Supports full body weight through 360° rotation | | Slip ring | 12-wire capsule slip ring, ≥ 2 A/circuit | Passes 19 V supply, 5 V body rail, motor control, and encoder signals through the rotating base joint | -### Camera Specifications - -| Property | Value | -|---|---| -| Sensor | OV5640, 5 MP | -| Interface | 8-bit parallel DVP to ESP32-S3 camera peripheral; I2C (SCCB) for autofocus control | -| Resolution | VGA (640×480) or higher via `esp32-camera`; JPEG output | -| Horizontal FOV | 72° (non-distorting lens) | -| XCLK | Internal 24 MHz oscillator on breakout (enable via jumper) | -| Capture pipeline | `esp32-camera` on ESP32-S3; JPEG frames forwarded to Jetson over USB serial | - -### PCA9685 Wiring (Planned) - -| Signal | Jetson 40-pin header pins | -|---|---| -| I2C SDA | Pin 3 | -| I2C SCL | Pin 5 | -| VCC (logic) | Pin 1 (3.3 V) | -| GND | Any GND pin | -| V+ (servo power) | Pins 2 or 4 (5 V, shared rail — see Power section) | - -### Servo PWM Parameters - -| Parameter | Value | -|---|---| -| PWM frequency | 50 Hz | -| Minimum pulse width | 500 µs | -| Maximum pulse 
width | 2400 µs | -| Channel 0 | Servo L (left push/pull rod) | -| Channel 1 | Servo R (right push/pull rod) | - -### Neck Mechanism - -Three-arm differential design. The head is connected to the neck at three -points: one passive rear pivot and two servo-driven push/pull rods at the -front. - -**Connection points** - -| Point | Type | Position | Notes | -|---|---|---|---| -| Back pivot | Ball joint (M3 rod-end or printed socket) | Rear centre of head, at head CG height | Passive — provides the reaction point. Ball joint allows small compliance to prevent binding during combined pitch+roll. | -| Left rod | M3 threaded rod with ball-link ends | Front-left of head, 35 mm left of centreline | Driven by Servo L. | -| Right rod | M3 threaded rod with ball-link ends | Front-right of head, 35 mm right of centreline | Driven by Servo R. | - -**Geometry constraints** - -| Dimension | Value | Notes | -|---|---|---| -| Rod attachment width | 35 mm (centre-to-centre) | Narrower = more pitch authority relative to roll. Calibrate after first print. | -| Back pivot height | At head CG | Head CG must be measured with all components installed. | -| Rod angle at neutral | ~perpendicular to head front face | Maximises mechanical advantage at the midpoint of travel. | -| Servo horn radius | 15 mm (starting point) | Adjust to tune travel range vs. torque. | -| Hard stops | ±32° pitch, ±17° roll | 2° mechanical margin beyond software limits. | - -**Servo command mixing** - -Pitch (nod) and roll (head-cock) are computed from the two servo positions: - -``` -pitch = (servo_L + servo_R) / 2 -roll = (servo_L - servo_R) / 2 -``` - -To command a desired pitch and roll: - -``` -servo_L = pitch_cmd + roll_cmd -servo_R = pitch_cmd - roll_cmd -``` - -Both servo outputs are clamped to hardware travel limits before being written -to the PCA9685. During normal tracking, `roll_cmd = 0` and both servos move -identically. 
- -**Servo mounting** - -Both servos mount at the base of the neck (body side), not inside the head. -This keeps the head's moment of inertia low for faster PID response. The -push/pull rods run up through or alongside the neck tube to the head -attachment points. +See [CIRCUIT.md](CIRCUIT.md) for wiring details and [BOM.md](BOM.md) for the full vision subsystem parts list. --- -## Range of Motion (Planned) - -### Angle Limits - -| Axis | Actuator | Command | Range | Hard stops | -|---|---|---|---|---| -| Pitch (head nod) | Servo L + Servo R together | `pitch_cmd` | ±30° | Yes, mechanical at ±32° | -| Roll (head cock) | Servo L vs Servo R differential | `roll_cmd` | ±15° | Yes, mechanical at ±17° | -| Yaw (base) | N20 + DRV8833 | base yaw command | 360° continuous | None — encoder-tracked in software | - -The head has no pan axis. All horizontal tracking is handled by base yaw. -Positive pitch = head tips up. Positive roll = head cocks right. Positive yaw = -clockwise viewed from above. Pitch and roll are clamped in software before -servo mixing; hard stops are a backup. - -### Named Positions - -| Position | Yaw | Pitch | Roll | Description | -|---|---|---|---|---| -| `home` | current | 0° | 0° | Head level, centred. Base holds position. | -| `rest` | current | -5° | 0° | Slight downward pitch toward seated user. Default between conversations. | -| `scan_start` | -45° | -5° | 0° | Base yaw at left edge of scan sweep. Head at rest pitch. | -| `scan_end` | +45° | -5° | 0° | Base yaw at right edge of scan sweep. Head at rest pitch. | +## Neck Mechanism -Base yaw is not reset on idle — it holds the last oriented position. +Three-arm differential: one passive rear pivot (ball joint at head CG height) +and two servo-driven push/pull rods at the front (35 mm apart). 
-### Scan Pattern +Servo mixing: +- Both servos together → pitch (nod, ±30°) +- Differential servo motion → roll (head-cock, ±15°) +- All horizontal tracking → base yaw (N20 motor, 360° continuous) -On wake the base sweeps from -45° to +45° yaw at ~20°/s while the head holds -rest pitch (-5° pitch, 0° roll). The sweep aborts as soon as a face is detected -or after 2.5 s. If no face is found within the cap, listening begins at whatever -yaw position the sweep reached. - -The head servos do not move during the scan sweep — only the base rotates. +Both servos mount at the neck base, not in the head, to keep rotational inertia +low. Rods run up through the neck tube to the head attachment points. --- -## Vision Service Design (Planned) - -> This section describes the intended design of `vision_service.py`, a -> background process that will run on the Jetson. No code is written yet. - -### Responsibilities - -The vision service will: +## Vision Service -1. Read incoming JPEG frames from the USB serial port (forwarded by the ESP32-S3 from the OV5640). -2. Run MediaPipe FaceDetector (full-range model, `model_selection=1`) on each - frame at approximately 20–30 fps. -3. Maintain the current tracking state (face centroid, confidence, pitch, roll, - yaw, servo L/R angles). -4. Run a PID position-control loop to convert face centroid error into pitch - and yaw commands; apply servo mixing for servo L and servo R. -5. Write servo L/R pulse widths to the PCA9685 over I2C; write base yaw - commands to the DRV8833 via PCA9685 channels 2–3. -6. Serve a small localhost HTTP API so the bridge can query state and issue - control commands. +A background process (`vision_service.py`, not yet written) will run on the +Jetson and expose a small localhost HTTP API. -### PID Control Loop Design - -The face centroid is expressed in normalized image coordinates where (0.5, 0.5) -is the center of the frame. 
- -| Variable | Definition | +| Endpoint | Description | |---|---| -| `error_x` | `centroid_x − 0.5` (positive = face is right of centre) | -| `error_y` | `centroid_y − 0.5` (positive = face is below centre) | -| `yaw_correction` | `PID(error_x) × fov_h` — sent to base yaw motor | -| `pitch_correction` | `PID(error_y) × fov_v` — applied via servo mixing | - -`fov_h` and `fov_v` are the camera's horizontal and vertical field of view in -degrees; measure per chosen webcam model. - -Servo mixing applies pitch correction with roll held at zero during tracking: - -``` -servo_L_cmd = pitch_correction + 0 (roll = 0 during tracking) -servo_R_cmd = pitch_correction - 0 -``` - -All commanded values are clamped to hardware limits before output. When face -detection confidence falls below threshold or no face is present, the PID -integrators are frozen and all actuators hold their last commanded position. - -### Tracking Strategy - -The head has no pan axis. Horizontal and vertical tracking use separate -actuators with no interaction between loops: - -| Axis | Actuator | PID input | Speed | -|---|---|---|---| -| Horizontal | Base yaw (N20 motor) | `error_x` | ~20–40°/s | -| Vertical | Head pitch (differential servo) | `error_y` | ~60°/s max slew | -| Roll | Differential servo | Not used during tracking (roll = 0) | — | - -Because the base handles all horizontal correction and the head handles all -vertical correction, there is no two-stage interaction or cross-axis dependency -to manage. Each PID loop is independent. - -### Localhost HTTP API - -The vision service will expose the following endpoints on localhost (port TBD): - -#### GET /face - -Returns current face tracking state. 
- -| Field | Type | Description | -|---|---|---| -| `x` | float | Normalized face centroid X (0.0–1.0) | -| `y` | float | Normalized face centroid Y (0.0–1.0) | -| `confidence` | float | Face detection confidence (0.0–1.0) | -| `pitch_deg` | float | Current head pitch command in degrees | -| `roll_deg` | float | Current head roll command in degrees | -| `yaw_deg` | float | Current base yaw position in degrees (encoder-derived) | -| `servo_l_deg` | float | Current Servo L pulse position in degrees | -| `servo_r_deg` | float | Current Servo R pulse position in degrees | - -Example: `{"x":0.52,"y":0.41,"confidence":0.94,"pitch_deg":-4.1,"roll_deg":0.0,"yaw_deg":12.3,"servo_l_deg":-4.1,"servo_r_deg":-4.1}` - -#### GET /frame - -Returns the latest JPEG frame received from the ESP32-S3. Used by the bridge when a -visual query is needed. - -Response: `image/jpeg` binary body. - -#### POST /track - -Enables or disables servo output from the PID loop. - -Request body: - -| Field | Type | Description | -|---|---|---| -| `enabled` | bool | `true` to start tracking, `false` to hold position | - -When disabled, the servos hold the last commanded position. The `home` and -`rest` commands below work regardless of tracking state. - -#### POST /home +| `GET /face` | Current face centroid, confidence, and servo positions | +| `GET /frame` | Latest JPEG from the OV5640 | +| `POST /track` | Enable or disable PID servo tracking | +| `POST /home` | Drive head to level neutral position | +| `POST /rest` | Drive head to rest pose (−5° pitch, 0° roll) | +| `GET /health` | Liveness check | -Drives head to pitch=0°, roll=0° (servo_L=0°, servo_R=0°) immediately. -Ignores tracking state. - -No request body required. - -#### POST /rest - -Drives head to pitch=-5°, roll=0° (servo_L=-5°, servo_R=-5°) immediately. -Ignores tracking state. - -No request body required. - -#### GET /health - -Liveness check. Returns 200 OK if the capture pipeline is running and the -PCA9685 is reachable. 
+Face detection uses MediaPipe FaceDetector. The PID loop converts centroid +error to pitch (servos) and yaw (base motor) commands. The head has no pan +axis — horizontal tracking is base yaw only. --- -## Robot State Additions (Planned) - -Two new state concepts are planned for Milestone 5. They extend the existing -state table in `README.md` without replacing it. - -### New State: `scanning` +## Visual Queries -| Property | Value | -|---|---| -| Trigger | Wake event received | -| Behavior | Base yaw sweeps from -45° to +45° at ~20°/s; head holds rest pitch (-5°, roll 0°) | -| Transition out | Face locked (→ `listening` + tracking active) or 2.5 s timeout (→ `listening`, no lock) | -| LED | Same as `wake_detected` → `listening` — no additional LED pattern needed | -| Duration cap | 2.5 s | - -The scanning state runs during the transition from wake detection to listening. -If a face is found within the cap, tracking activates and the conversation -proceeds normally. If no face is found within 2.5 s, listening begins anyway -with the head at whatever position the sweep reached. - -### New Mode Flag: `tracking` - -Tracking is not a standalone state — it is a concurrent mode flag that can be -active during `listening`, `thinking`, and `speaking` states. +When the user asks what 3bo sees ("what's in front of you", "describe this", +"what is that"): -| Property | Value | -|---|---| -| Activated | When a face is locked during scanning | -| Deactivated | When the robot returns to idle | -| Effect | PID loop drives servos each frame to keep face centered | -| LED | No change — underlying conversation state LEDs remain in effect | - -Because tracking is a background mode flag rather than a foreground state, no -new LED pattern is needed for it. The head simply moves while the existing -conversation LED patterns play. +1. Bridge calls `GET /frame` for the latest JPEG. +2. Frame is passed to `moondream` via Ollama (`ollama pull moondream`). +3. 
Prompt: *"Describe what you see concisely in two sentences."* +4. Description is prepended to the Arbiter message as `[Visual context: ...]`. +5. Query routes to the cloud agent regardless of the complexity classifier. --- -## Bridge Integration (Planned) - -> This section describes planned changes to -> `examples/3bo/bridge/bridge.py` and the generic bridge at -> `examples/voice-bridge/bridge.py`. No code is written yet. - -### Vision Service Base URL +## On Wake -The bridge will read a `THREEBO_VISION_URL` environment variable (default: -`http://127.0.0.1:PORT`). All vision API calls go to that base. +When the bridge receives `audio.wake_detected`: -### Wake Event Handler (Planned) - -When the bridge receives a wake event: - -1. Call `POST /vision/track` with `{"enabled": true}`. -2. Begin the servo scan sweep by driving servos to `scan_start` position and - issuing incremental angle commands toward `scan_end` at 20°/s. -3. Poll `GET /vision/face` each sweep step. If `confidence` exceeds a threshold - (TBD, e.g. 0.85), stop sweep and let PID loop take over. -4. After lock or 2.5 s timeout, transition to `listening`. - -### Idle Return Handler (Planned) - -When the bridge returns to idle after a conversation: - -1. Call `POST /vision/rest` to return the head to the rest pose. -2. Call `POST /vision/track` with `{"enabled": false}`. - -### Visual Keyword Detection (Planned) - -Before forwarding a transcript to Arbiter, the bridge will run a -`needs_vision(transcript)` check. - -#### Trigger Keywords - -| Keyword or phrase | -|---| -| see | -| seeing | -| look | -| looking | -| show | -| in front of you | -| around you | -| what is that | -| describe | -| notice | - -#### Visual Query Pipeline - -When `needs_vision` returns true: - -1. Call `GET /vision/frame` to retrieve the latest JPEG. -2. Base64-encode the frame. -3. Call the Ollama `/api/generate` endpoint with model `moondream` and the - encoded frame as the image input. -4. 
Use the prompt: `"Describe what you see concisely in two sentences."` -5. Prepend the model's response to the Arbiter message as: - `[Visual context: ]` -6. Route the message to the cloud agent regardless of complexity classification - (visual context requires a full model response; the local fast-path agent - should not receive image-derived context). - -#### moondream2 Model Details - -| Property | Value | -|---|---| -| Model | moondream2 | -| Size | ~1.6 B parameters | -| Ollama name | `moondream` | -| Pull command | `ollama pull moondream` | -| API | Standard Ollama `/api/generate` with base64 image field | -| Prompt | "Describe what you see concisely in two sentences." | - ---- - -## Latency Notes (Planned) - -These are expected latency figures based on hardware specifications. Actual -values will need to be measured during Milestone 5 integration. - -| Operation | Expected latency | -|---|---| -| Face detection loop | ~33 ms per frame at ~30 fps | -| PID update | Each frame (~33 ms interval) | -| moondream2 frame query on Jetson Orin Nano | ~1.5–3 s | -| MG90S servo slew (rest to typical face angle ~30°) | ~50 ms | -| Scan to face lock (face within ±45°, 20°/s sweep) | 0–4.5 s | -| Scan timeout fallback | 2.5 s cap | - -The 2.5 s scan cap keeps wake-to-listening latency bounded at a level that -feels acceptable even when no face is present. The VLM query latency (1.5–3 s) -is additive to the normal STT and Arbiter latency for visual queries; that -budget should be communicated to users if possible (e.g. an extended thinking -LED phase). 
- ---- - -## Power Notes (Planned) - -### Servo Current Budget - -| Condition | Current per servo | Total (2 servos) | -|---|---|---| -| Idle / holding position | ~50 mA | ~100 mA | -| Active movement | ~150 mA (typical) | ~300 mA | -| Stall (hard limit) | ~250 mA | ~500 mA | - -### Supply Rail Plan - -| Rail | Source | Load | -|---|---|---| -| Servo V+ | Jetson 40-pin 5 V (pins 2 and 4) | PCA9685 V+ → Servo L, Servo R | -| Motor VM | Same 5 V rail → DRV8833 VM | N20 base motor (100–300 mA typical) | -| PCA9685 logic VCC | Jetson 40-pin 3.3 V (pin 1) | PCA9685 logic only | -| Encoder VCC | Base 3.3 V rail | N20 encoder logic | -| LED and audio 5 V | Existing USB/Nano body rail | NeoPixel, MAX98357A amp | - -Peak draw with both servos slewing and base motor running: ~700 mA on the 5 V -rail. The Jetson 40-pin 5 V header is rated ~3 A — sufficient with margin. -Add bulk capacitance (220–470 µF) near both the PCA9685 V+ terminal and the -DRV8833 VM pin to absorb simultaneous inrush from servos and motor start. - -The Jetson 40-pin 5 V rail (pins 2 and 4) can supply up to approximately 3 A, -which comfortably covers two MG90S servos at peak draw with margin remaining -for other 5 V peripherals. - -### Rail Isolation - -Keep servo V+ on the PCA9685 board separate from the logic body rail serving -LEDs and audio. Servo PWM noise and inrush current during slew should not -affect the audio amplifier or NeoPixel power path. Add bulk capacitance (100– -470 µF) near the PCA9685 V+ terminal to absorb slew inrush. - -Do not exceed the Jetson header 5 V current rating. If a future build adds more -servos, a dedicated 5 V regulator fed from the main battery rail is the -recommended path — not additional draws on the header pins. +1. Base sweeps from −45° to +45° yaw at ~20°/s (scan state). +2. If face detected (confidence > 0.85), PID tracking activates. +3. After lock or 2.5 s timeout, transition to listening. +4. On idle return: call `/rest`, disable tracking. 
--- ## Milestone Placement -| Milestone | Status | Scope | -|---|---|---| -| Milestone 1 | Planned | Software loop: Jetson, Arbiter, STT, TTS, bridge | -| Milestone 2 | Planned | Audio and LED loop: I2S mic, speaker, NeoPixel | -| Milestone 3 | Planned | Wake word: on-device detection, VAD, mute switch | -| Milestone 4 | Planned | Product hardening: pairing, persistence, OTA, recovery | -| **Milestone 5** | **This document** | **Vision and head-tracking: camera, servos, VLM queries** | - -### v1 Prototype Constraints - -The v1 prototype (Milestones 1–4) has no camera, no PCA9685, and no servos. -None of the hardware described in this document is installed in v1. - -The neck bracket design should include servo pocket cutouts and mounting holes -sized for MG90S servos so that Milestone 5 hardware can be installed without -a major mechanical rebuild. Pockets should be left empty in v1, with the servo -wire channels sealed against debris. - -The vision service (`vision_service.py`) does not run in v1. Its dependencies -(MediaPipe, OpenCV, `smbus2`, PCA9685 library) are not installed in v1. - ---- - -*Last updated: 2026-06-12. This is a design document. All content describes -planned future work for Milestone 5 and does not reflect the current state of -the 3bo prototype.* +| Milestone | Scope | +|---|---| +| 1–4 | Voice prototype (current) — no camera, servos, or PCA9685 | +| **5** | **This document — camera, head-tracking, VLM visual queries** | diff --git a/examples/3bo/agents/3bo-events.json b/examples/3bo/agents/3bo-events.json new file mode 100644 index 0000000..0d3fdf0 --- /dev/null +++ b/examples/3bo/agents/3bo-events.json @@ -0,0 +1,18 @@ +{ + "name": "3bo-events", + "role": "device-monitor", + "model": "ollama/gemma3:4b", + "max_tokens": 64, + "temperature": 0.1, + "goal": "Monitor hardware state transitions from the 3bo robot. Acknowledge events in one sentence. 
Log unusual conditions.", + "rules": [ + "Respond in one sentence or less — no preamble.", + "For device.boot: acknowledge startup with the reported IP.", + "For device.mute: note that the microphone is now muted.", + "For device.unmute: note that the microphone is active again.", + "For device.wifi_reconnect: acknowledge reconnection.", + "For device.audio_error: flag the stage that failed clearly.", + "For audio.wake_detected: confirm wake was received.", + "For audio.utterance_discarded: note the discard reason if present." + ] +} diff --git a/examples/3bo/agents/3bo-voice.json b/examples/3bo/agents/3bo-voice.json new file mode 100644 index 0000000..4a936ca --- /dev/null +++ b/examples/3bo/agents/3bo-voice.json @@ -0,0 +1,16 @@ +{ + "name": "3bo-voice", + "role": "voice-assistant", + "model": "claude-sonnet-4-latest", + "max_tokens": 256, + "temperature": 0.4, + "capabilities": ["/parallel", "/fetch", "/mem"], + "goal": "Answer voice queries for the 3bo robot. Responses are spoken aloud — keep them conversational, warm, and brief. For queries that benefit from multiple sources (current events, weather, multi-part questions), use /parallel to gather data concurrently before synthesising.", + "rules": [ + "Respond in two or three sentences maximum.", + "No markdown, lists, or formatting — output is synthesised to speech.", + "For multi-source queries, use /parallel to fetch sources concurrently rather than sequentially.", + "For simple factual questions, answer directly without tools.", + "Never start a response with 'I' as the first word." 
+ ] +} diff --git a/examples/3bo/agents/threebo-monitor.json b/examples/3bo/agents/threebo-monitor.json new file mode 100644 index 0000000..c89c04d --- /dev/null +++ b/examples/3bo/agents/threebo-monitor.json @@ -0,0 +1,26 @@ +{ + "id": "threebo-monitor", + "name": "3bo Hardware Monitor", + "role": "hardware-monitor", + "model": "ollama/gemma3:4b", + "max_tokens": 192, + "temperature": 0.1, + "goal": "Monitor 3bo device hardware events. Log state transitions (mute, error, reconnect), identify error patterns, and flag when the device needs operator attention. Respond in one sentence. Do not generate audio — these turns are for logging and alerting only.", + "capabilities": ["/mem", "/todo"], + "event_types": [ + "device.*", + "audio.*" + ], + "rules": [ + "Respond in a single short sentence summarising what happened.", + "For device.error events, store a /mem entry with the source and timestamp; if the same source has errored more than twice today, also create a /todo flagging it for operator review.", + "For device.mute events, acknowledge without action.", + "For device.unmute events, acknowledge without action.", + "For device.wifi_reconnect events, acknowledge the reconnection.", + "For device.conversation.reset events, acknowledge that memory was cleared.", + "For audio.wake_detected events, acknowledge without action.", + "For audio.utterance_discarded events, note the reason if present.", + "For audio.turn_complete events, note the tier (local or cloud) and whether the turn produced output.", + "Never generate speech text — these events bypass the TTS pipeline." 
+ ] +} diff --git a/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_config.example.h b/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_config.example.h index 6dc089c..f6ace9a 100644 --- a/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_config.example.h +++ b/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_config.example.h @@ -22,7 +22,14 @@ constexpr bool THREEBO_ENABLE_ENERGY_WAKE = false; constexpr int32_t THREEBO_ENERGY_WAKE_THRESHOLD = 1200; // Keep first tests gentle for the 0.2 W speaker and 8-pixel LED stick. -constexpr uint8_t THREEBO_LED_BRIGHTNESS = 28; -constexpr uint8_t THREEBO_RECORD_SECONDS = 4; -constexpr size_t THREEBO_MAX_RESPONSE_WAV_BYTES = 512 * 1024; -constexpr uint32_t THREEBO_HTTP_TIMEOUT_MS = 30000; +constexpr uint8_t THREEBO_LED_BRIGHTNESS = 28; +constexpr uint8_t THREEBO_RECORD_SECONDS = 4; +constexpr size_t THREEBO_MAX_RESPONSE_WAV_BYTES = 512 * 1024; +constexpr uint32_t THREEBO_HTTP_TIMEOUT_MS = 30000; + +// Hardware event reporting. When enabled, the firmware POSTs state transitions +// (mute, error, Wi-Fi reconnect) to the Jetson bridge as JSON events, which +// the bridge forwards to Arbiter POST /v1/events for agent-driven handling. +// Events are fire-and-forget; keep the timeout short since the bridge is LAN-local. 
+constexpr bool THREEBO_ENABLE_EVENTS = true; +constexpr uint32_t THREEBO_EVENT_TIMEOUT_MS = 1000; diff --git a/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_nano_esp32.ino b/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_nano_esp32.ino index c4c07fc..f7c2c57 100644 --- a/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_nano_esp32.ino +++ b/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_nano_esp32.ino @@ -43,6 +43,8 @@ uint32_t state_started_ms = 0; uint32_t last_wifi_attempt_ms = 0; uint32_t last_energy_wake_ms = 0; bool audio_rx_ready = false; +bool boot_event_pending = true; // emit device.boot on first WiFi connect +bool prev_muted_state = false; // edge-detect mute transitions size_t min_size(size_t a, size_t b) { return a < b ? a : b; @@ -309,6 +311,35 @@ bool wake_detected() { return serial_wake_requested() || energy_wake_detected(); } +// POST a JSON event to the bridge POST /v1/event endpoint. +// Fire-and-forget: the bridge relays it to Arbiter POST /v1/events. +// No-op when THREEBO_ENABLE_EVENTS is false or Wi-Fi is not connected. +// data_json must be a valid JSON object literal (e.g. "{}"). 
+void send_event(const char *type, const char *data_json = "{}") { + if (!THREEBO_ENABLE_EVENTS) return; + if (WiFi.status() != WL_CONNECTED) return; + + WiFiClient client; + HTTPClient http; + const String url = String(THREEBO_BRIDGE_BASE_URL) + "/v1/event"; + + if (!http.begin(client, url)) return; + http.setTimeout(THREEBO_EVENT_TIMEOUT_MS); + http.addHeader("Content-Type", "application/json"); + http.addHeader("Authorization", String("Bearer ") + THREEBO_DEVICE_SECRET); + + String body = "{\"type\":\""; + body += type; + body += "\",\"data\":"; + body += data_json; + body += "}"; + + http.POST(body); + http.end(); + Serial.print("event="); + Serial.println(type); +} + uint8_t *record_utterance_wav(size_t *out_len) { *out_len = 0; if (!audio_rx_ready && !begin_audio_rx()) return nullptr; @@ -462,6 +493,8 @@ void handle_turn() { if (is_muted()) return; + send_event("audio.wake_detected", "{}"); + set_state(RobotState::Listening); size_t wav_len = 0; uint8_t *wav = record_utterance_wav(&wav_len); @@ -472,11 +505,13 @@ void handle_turn() { if (!wav || wav_len <= WAV_HEADER_BYTES || is_muted()) { if (wav) free(wav); Serial.println("utterance discarded"); + send_event("audio.utterance_discarded", "{}"); return; } const bool ok = upload_utterance_and_play_response(wav, wav_len); if (!ok) { + send_event("device.error", "{\"source\":\"bridge\"}"); set_state(RobotState::Error); const uint32_t error_started = millis(); while (millis() - error_started < ERROR_HOLD_MS) { @@ -508,6 +543,7 @@ void setup() { connect_wifi(); if (!begin_audio_rx()) { set_state(RobotState::Error); + send_event("device.audio_error", "{\"stage\":\"init\"}"); } else { set_state(is_muted() ? 
RobotState::Muted : RobotState::Idle); } @@ -524,6 +560,13 @@ void loop() { } if (state == RobotState::WifiConnecting) { + const String ip = WiFi.localIP().toString(); + if (boot_event_pending) { + boot_event_pending = false; + send_event("device.boot", ("{\"ip\":\"" + ip + "\"}").c_str()); + } else { + send_event("device.wifi_reconnect", ("{\"ip\":\"" + ip + "\"}").c_str()); + } set_state(RobotState::Idle); } @@ -533,7 +576,13 @@ void loop() { return; } - if (is_muted()) { + const bool cur_muted = is_muted(); + if (cur_muted != prev_muted_state) { + prev_muted_state = cur_muted; + send_event(cur_muted ? "device.mute" : "device.unmute", "{}"); + } + + if (cur_muted) { set_state(RobotState::Muted); delay(25); return; diff --git a/examples/voice-bridge/bridge.py b/examples/voice-bridge/bridge.py index c8f9998..aaaea40 100644 --- a/examples/voice-bridge/bridge.py +++ b/examples/voice-bridge/bridge.py @@ -15,9 +15,10 @@ 7. Concatenate PCM → WAV header → return with Content-Length. Endpoints: - POST /v1/utterance WAV upload → WAV response (main device endpoint) - POST /v1/transcribe WAV upload → JSON {"transcript": "..."} (debug/test) - GET /health 200 ok\\n + POST /v1/utterance WAV upload → WAV response (main device endpoint) + POST /v1/transcribe WAV upload → JSON {"transcript": "..."} (debug/test) + POST /v1/event JSON hardware event → forwarded to Arbiter /v1/events + GET /health 200 ok\\n Required environment variables: ARBITER_TOKEN Arbiter bearer token (atr_...) @@ -38,6 +39,7 @@ BRIDGE_CONVERSATION_FILE path to persist conversation state across restarts (e.g. 
/etc/3bo/conversation.json) if unset, cloud turns are stateless (no memory) + BRIDGE_EVENTS_ENABLED forward device events to Arbiter, default 1 (set to 0 to disable) Memory model: Simple/local queries (arithmetic, time, greetings) are routed stateless to @@ -109,9 +111,10 @@ def _opt(name: str, fallback: str) -> str: PIPER_BIN = _opt("PIPER_BIN", "piper") PIPER_SAMPLE_RATE = int(_opt("PIPER_SAMPLE_RATE", "16000")) BRIDGE_API_KEY = os.environ.get("BRIDGE_API_KEY", "").strip() -BRIDGE_MAX_BYTES = int(_opt("BRIDGE_MAX_BYTES", "524288")) -BRIDGE_RATE_LIMIT = int(_opt("BRIDGE_RATE_LIMIT", "20")) -BRIDGE_CONV_FILE = os.environ.get("BRIDGE_CONVERSATION_FILE", "").strip() +BRIDGE_MAX_BYTES = int(_opt("BRIDGE_MAX_BYTES", "524288")) +BRIDGE_RATE_LIMIT = int(_opt("BRIDGE_RATE_LIMIT", "20")) +BRIDGE_CONV_FILE = os.environ.get("BRIDGE_CONVERSATION_FILE", "").strip() +BRIDGE_EVENTS_ENABLED = os.environ.get("BRIDGE_EVENTS_ENABLED", "1").strip() not in ("0", "false", "off") _parsed = urlparse(ARBITER_URL) ARBITER_HOST = _parsed.hostname or "127.0.0.1" @@ -485,6 +488,62 @@ def _cloud_stream(message: str, idkey: str) -> Iterator[str]: raise RuntimeError("conversation unavailable after two attempts") +# ────────────────────────────────────────────────────────────────────────────── +# Device event forwarding +# ────────────────────────────────────────────────────────────────────────────── + +def _drain_event_stream(conn: http.client.HTTPConnection, + resp: http.client.HTTPResponse) -> None: + """Consume the Arbiter SSE stream for an event turn on a daemon thread.""" + try: + buf = b"" + while len(buf) < 16384: + chunk = resp.read(256) + if not chunk: + break + buf += chunk + if b"event: done" in buf or b"event:done" in buf: + break + except Exception: + pass + finally: + try: + conn.close() + except Exception: + pass + + +def forward_event_to_arbiter(event_type: str, source: str, + payload: dict) -> None: + """POST a device event to Arbiter /v1/events and drain the SSE reply.""" + 
body = json.dumps({ + "type": event_type, + "source": source, + "payload": payload, + }).encode() + headers = { + "Content-Type": "application/json", + "Content-Length": str(len(body)), + "Authorization": f"Bearer {ARBITER_TOKEN}", + "Accept": "text/event-stream", + } + try: + conn = http.client.HTTPConnection(ARBITER_HOST, ARBITER_PORT, timeout=10) + conn.request("POST", "/v1/events", body=body, headers=headers) + resp = conn.getresponse() + if resp.status != 200: + log.warning("event forward HTTP %d type=%s", resp.status, event_type) + resp.read() + conn.close() + return + log.info("event forwarded type=%s source=%s", event_type, source) + threading.Thread( + target=_drain_event_stream, args=(conn, resp), daemon=True + ).start() + except Exception as exc: + log.warning("event forward failed type=%s: %s", event_type, exc) + + # ────────────────────────────────────────────────────────────────────────────── # Sentence splitting # ────────────────────────────────────────────────────────────────────────────── @@ -538,6 +597,12 @@ def process_utterance( if _conv_mgr is not None: new_id = _conv_mgr.reset() log.info("turn=%s memory reset new_conversation=%d", turn_id, new_id) + if BRIDGE_EVENTS_ENABLED: + threading.Thread( + target=forward_event_to_arbiter, + args=("device.conversation.reset", source, {"turn_id": turn_id}), + daemon=True, + ).start() tier = "cloud" log.info("turn=%s tier=reset->cloud", turn_id) text_stream = _cloud_stream(transcript, turn_id) @@ -579,6 +644,20 @@ def process_utterance( if not total_pcm: return make_error_wav("I don't have a response for that."), transcript + if BRIDGE_EVENTS_ENABLED: + elapsed_ms = int((t_done - t0) * 1000) + threading.Thread( + target=forward_event_to_arbiter, + args=("audio.turn_complete", source, { + "turn_id": turn_id, + "tier": tier, + "transcript_chars": len(transcript), + "pcm_bytes": len(total_pcm), + "elapsed_ms": elapsed_ms, + }), + daemon=True, + ).start() + return make_wav(total_pcm), transcript @@ -594,6 
+673,8 @@ def do_POST(self) -> None: self._handle_utterance() elif self.path == "/v1/transcribe": self._handle_transcribe() + elif self.path in ("/v1/event", "/v1/device-event"): + self._handle_device_event() else: self.send_error(404, "not found") @@ -611,6 +692,41 @@ def _handle_utterance(self) -> None: wav, _ = process_utterance(body, source=self.client_address[0], complexity_hint=hint) self._send_wav(wav) + def _handle_device_event(self) -> None: + """Accept a hardware state event from the device and forward to Arbiter.""" + if not self._authorized(): + self.send_error(401, "unauthorized") + return + try: + length = int(self.headers.get("Content-Length") or "0") + except ValueError: + self.send_error(400, "invalid content-length") + return + if length <= 0 or length > 4096: + self.send_error(400, "bad body size") + return + body_bytes = self.rfile.read(length) + try: + event = json.loads(body_bytes) + event_type = str(event.get("type", "")) + # Firmware sends "data"; legacy/direct callers may send "payload". 
+ payload = event.get("data") or event.get("payload") or {} + if not isinstance(payload, dict): + payload = {} + except (json.JSONDecodeError, AttributeError): + self.send_error(400, "invalid json") + return + if not event_type: + self.send_error(400, "missing type") + return + if BRIDGE_EVENTS_ENABLED: + threading.Thread( + target=forward_event_to_arbiter, + args=(event_type, "3bo", payload), + daemon=True, + ).start() + self._send_json(202, b'{"ok":true}') + def _handle_transcribe(self) -> None: """STT-only — useful for debug and latency measurement.""" body = self._read_audio_body() From ef8cc0d119d2efb0fb9e9e623c81e41d360117b1 Mon Sep 17 00:00:00 2001 From: Tyler Reckart Date: Wed, 24 Jun 2026 09:10:37 -0400 Subject: [PATCH 4/4] revise documentation --- examples/3bo/BOM.md | 15 +- examples/3bo/CIRCUIT.md | 30 +- examples/3bo/README.md | 4 +- .../threebo_nano_esp32/threebo_nano_esp32.ino | 455 ++++++++++-------- 4 files changed, 289 insertions(+), 215 deletions(-) diff --git a/examples/3bo/BOM.md b/examples/3bo/BOM.md index 7b37953..2946aee 100644 --- a/examples/3bo/BOM.md +++ b/examples/3bo/BOM.md @@ -25,9 +25,8 @@ the verified USB/body 5 V budget, or add a Jetson-powered USB hub/accessory | 1 | Microcontroller | Arduino Nano ESP32 | 25-35 | Main controller. ESP32-S3, 3.3 V I/O, USB-C. Powered by the Jetson USB host link in the wired build. | | 1 | I2S MEMS microphone | Adafruit ICS-43434 I2S MEMS microphone breakout, product 6049 | 5 | Digital mono microphone. Power from 3.3 V; not for 5 V logic. | | 1 | I2S audio amplifier | Adafruit MAX98357A I2S 3 W class-D amplifier breakout, product 3006 | 6 | Drives the speaker directly from I2S audio. Runs from 2.7-5.5 V and accepts 3.3 V logic. | -| 1 | Breadboard speaker | Adafruit breadboard-friendly PCB mount mini speaker, 8 ohm 0.2 W, product 1898 | 2 | Quiet first-test speaker. Do not overdrive it with the MAX98357A. 
| -| 1 | Final enclosure speaker | 8 ohm 1-3 W small speaker | 3-8 | Optional upgrade once the audio path works. Better suited to spoken responses than the 0.2 W breadboard speaker. | -| 1 | Addressable LED indicator | Adafruit NeoPixel Stick, 8 x 5050 RGBW cool white, product 2869 | 8 | Main 3bo status indicator. 5 V power, one data pin, RGBW library required. | +| 1 | Speaker | Adafruit 3" diameter 8 ohm 1 W speaker, product 1313 | 4 | Robot eye speaker. 8 ohm, 1 W max. MAX98357A delivers ~1.8 W into 8 ohm on 5 V — keep volume below speaker rating. | +| 1 | Addressable LED indicator | Adafruit NeoPixel Jewel, 7 x 5050 RGBW warm white ~3000K, product 2858 | 6 | Robot eye. 7 pixels: centre (index 0) + 6-pixel outer ring (indices 1–6). 5 V power, one data pin, NEO_GRBW + NEO_KHZ800. | | 1 | USB-C data cable | Short, data-capable USB-C cable from Jetson USB host to Nano ESP32 | 5-15 | Carries power and serial data between Jetson and Nano. Avoid charge-only cables. | | 1 | USB 5 V breakout or measured VBUS access | USB-C breakout, powered USB hub, or carrier-approved 5 V accessory output | varies | Optional only if the NeoPixel/amp need more 5 V current than the Nano exposes safely. Verify current limits before use. | | 1 | 5 V body rail, optional | Jetson-powered USB hub/accessory 5 V rail, current-limited/fused | 15-40 | Use only if speaker/LED tests exceed the safe USB/Nano 5 V budget. Not battery-fed separately. 
| @@ -120,13 +119,13 @@ For the smallest useful prototype order: - Short data-capable USB-C cable from Jetson to Nano - Adafruit ICS-43434 I2S microphone breakout, product 6049 - Adafruit MAX98357A I2S amplifier breakout, product 3006 -- Adafruit breadboard-friendly 8 ohm 0.2 W mini speaker, product 1898 -- Adafruit NeoPixel Stick 8 x RGBW cool white, product 2869 +- Adafruit 3" 8 ohm 1 W speaker, product 1313 +- Adafruit NeoPixel Jewel 7 x RGBW warm white, product 2858 - Adafruit adjustable breadboard power supply kit, product 184 - DPDT hard-mute switch, or SPST switch plus microphone load-switch circuit - SPST power switch - Microphone power-switch/load-switch parts for hard mute -- 74AHCT/74HCT data level shifter for the NeoPixel stick +- 74AHCT/74HCT data level shifter for the NeoPixel Jewel - 470-1000 uF capacitor - 330-470 ohm resistor - Jumper wires and breadboard/perfboard @@ -179,11 +178,11 @@ Before connecting the Jetson to a battery regulator, run these acceptance tests: - Arduino Nano ESP32 product/spec page: https://store.arduino.cc/products/nano-esp32 - Arduino Nano ESP32 docs: https://docs.arduino.cc/hardware/nano-esp32/ - Adafruit MAX98357A I2S amplifier: https://www.adafruit.com/product/3006 -- Adafruit breadboard-friendly 8 ohm 0.2 W mini speaker: https://www.adafruit.com/product/1898 +- Adafruit 3" 8 ohm 1 W speaker: https://www.adafruit.com/product/1313 - Adafruit ICS-43434 I2S microphone breakout: https://www.adafruit.com/product/6049 - Adafruit MAX9814 electret microphone amplifier with AGC: https://www.adafruit.com/product/1713 - Adafruit MAX4466 electret microphone amplifier: https://www.adafruit.com/product/1063 -- Adafruit NeoPixel Stick 8 x RGBW cool white: https://www.adafruit.com/product/2869 +- Adafruit NeoPixel Jewel 7 x RGBW warm white: https://www.adafruit.com/product/2858 - Adafruit adjustable breadboard power supply kit: https://www.adafruit.com/product/184 - Pololu step-down voltage regulators: 
https://www.pololu.com/category/131/step-down-buck-voltage-regulators - NVIDIA Jetson Orin Nano Super Developer Kit: https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/jetson-orin/nano-super-developer-kit/ diff --git a/examples/3bo/CIRCUIT.md b/examples/3bo/CIRCUIT.md index b9a8d8b..c97f567 100644 --- a/examples/3bo/CIRCUIT.md +++ b/examples/3bo/CIRCUIT.md @@ -126,7 +126,7 @@ enclosure wall. | MAX98357A pin | Connects to | Notes | | --- | --- | --- | -| `VIN` | 5 V rail | Keep volume low with the 0.2 W speaker. | +| `VIN` | 5 V rail | Amp delivers ~1.8 W into 8 Ω on 5 V — stay below the 1 W speaker rating. | | `GND` | GND rail | Common ground. | | `BCLK` | Nano `D2` | Shared I2S bit clock. | | `LRC` / `LRCLK` | Nano `D3` | Shared I2S word select. | @@ -136,17 +136,17 @@ enclosure wall. | `+` speaker output | Speaker `+` | Bridge-tied output. Do not connect to GND. | | `-` speaker output | Speaker `-` | Bridge-tied output. Do not connect to GND. | -The MAX98357A can overpower the 0.2 W speaker. Use the lowest possible software -volume for the first test, and prefer short test tones. For safer bring-up, -place a 47-100 ohm resistor in series with one speaker lead, or use an 8 ohm -1 W speaker. +The MAX98357A delivers approximately 1.8 W into 8 Ω on a 5 V supply. The +selected speaker (product 1313) is rated 1 W, so keep software volume moderate. -### Breadboard speaker, product 1898 +### Speaker, product 1313 + +3" diameter, 8 Ω, 1 W. Four mounting tabs at 60 mm spacing. | Speaker pin | Connects to | Notes | | --- | --- | --- | -| One speaker pin | MAX98357A speaker `+` | Polarity is not critical for a single speaker. | -| Other speaker pin | MAX98357A speaker `-` | Do not connect either speaker pin to GND. | +| One speaker lead | MAX98357A speaker `+` | Polarity is not critical for a single speaker. | +| Other speaker lead | MAX98357A speaker `-` | Do not connect either speaker lead to GND. 
| ### 74AHCT/74HCT level shifter for NeoPixel data @@ -163,16 +163,20 @@ the logic is the same. | `DIR` | fixed direction, if present | Tie for A-to-Y direction on bidirectional parts. Not present on simple buffers. | | Unused inputs | GND or defined level | Do not leave CMOS inputs floating. | -### NeoPixel Stick RGBW, product 2869 +### NeoPixel Jewel RGBW, product 2858 + +7 pixels: index 0 is the centre LED; indices 1–6 are the outer ring. Firmware +addresses them individually — ring-chase animations rotate through indices 1–6, +with the centre pixel held at a dim accent colour. | NeoPixel pin | Connects to | Notes | | --- | --- | --- | -| `5V` / `+` | 5 V rail | Cap brightness in firmware. | +| `PWR` / `+` | 5 V rail | Cap brightness in firmware. | | `GND` / `-` | GND rail | Common ground. | -| `DIN` | 330-470 ohm resistor from level shifter output | Use the input side of the stick, not `DOUT`. | -| `DOUT` | unconnected | Only used if chaining another NeoPixel module. | +| `DIN` | 330-470 ohm resistor from level shifter output | Use the `DIN` pad, not `DOUT`. | +| `DOUT` | unconnected | Only used if chaining another Jewel. | -Firmware must configure this as RGBW, not RGB. +Firmware type constant: `NEO_GRBW + NEO_KHZ800` (same protocol as the former NeoPixel Stick). 
### Hard mute switch diff --git a/examples/3bo/README.md b/examples/3bo/README.md index e3e2a3f..ff9bb74 100644 --- a/examples/3bo/README.md +++ b/examples/3bo/README.md @@ -76,8 +76,8 @@ flowchart LR | Controller | Arduino Nano ESP32 | Wake word, I2S mic/amp, LEDs, mute | | Microphone | Adafruit ICS-43434 I2S MEMS mic breakout, product 6049 | Digital mono speech capture | | Amplifier | Adafruit MAX98357A I2S amplifier, product 3006 | Speaker playback | -| Speaker | Adafruit 8 ohm 0.2 W mini speaker, product 1898 | Bring-up audio output | -| LEDs | Adafruit NeoPixel Stick 8 × RGBW, product 2869 | Robot state display | +| Speaker | Adafruit 3" 8 ohm 1 W speaker, product 1313 | Audio output | +| LEDs | Adafruit NeoPixel Jewel 7 × RGBW, product 2858 | Robot eye / state display | | Mute | DPDT switch + P-channel MOSFET or load switch | Hard microphone power cutoff | Power rules: diff --git a/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_nano_esp32.ino b/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_nano_esp32.ino index f7c2c57..abc2228 100644 --- a/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_nano_esp32.ino +++ b/examples/3bo/firmware/arduino/threebo_nano_esp32/threebo_nano_esp32.ino @@ -1,29 +1,36 @@ #include #include -#include +#include #include #include #include #include "threebo_config.h" -constexpr int PIN_I2S_BCLK = D2; -constexpr int PIN_I2S_WS = D3; -constexpr int PIN_I2S_MIC = D4; -constexpr int PIN_I2S_AMP = D5; +// Arduino Nano ESP32 pin labels — used for Arduino API (pinMode / digitalRead / +// digitalWrite). With BOARD_HAS_PIN_REMAP these are logical numbers that the +// board's remap layer converts to the real ESP32-S3 GPIO. constexpr int PIN_PIXELS = D6; -constexpr int PIN_MUTE = D7; +constexpr int PIN_MUTE = D7; constexpr int PIN_AMP_SD = D8; +// Raw ESP32-S3 GPIO numbers required by the ESP-IDF I2S driver (bypasses the +// Arduino remap layer). D2=GPIO5, D3=GPIO6, D4=GPIO7, D5=GPIO8. 
+constexpr int GPIO_I2S_BCLK = 5; // D2 +constexpr int GPIO_I2S_WS = 6; // D3 +constexpr int GPIO_I2S_MIC = 7; // D4 data-in from ICS-43434 +constexpr int GPIO_I2S_AMP = 8; // D5 data-out to MAX98357A + constexpr uint32_t SAMPLE_RATE_HZ = 16000; -constexpr uint16_t PIXEL_COUNT = 8; +constexpr uint16_t PIXEL_COUNT = 7; +constexpr uint16_t JEWEL_RING_FIRST = 1; +constexpr uint16_t JEWEL_RING_COUNT = 6; constexpr size_t WAV_HEADER_BYTES = 44; constexpr size_t AUDIO_CHUNK_BYTES = 512; constexpr uint32_t WIFI_RETRY_INTERVAL_MS = 5000; constexpr uint32_t ERROR_HOLD_MS = 1500; Adafruit_NeoPixel pixels(PIXEL_COUNT, PIN_PIXELS, NEO_GRBW + NEO_KHZ800); -I2SClass Audio; enum class RobotState : uint8_t { Boot, @@ -43,25 +50,23 @@ uint32_t state_started_ms = 0; uint32_t last_wifi_attempt_ms = 0; uint32_t last_energy_wake_ms = 0; bool audio_rx_ready = false; -bool boot_event_pending = true; // emit device.boot on first WiFi connect -bool prev_muted_state = false; // edge-detect mute transitions +bool boot_event_pending = true; +bool prev_muted_state = false; -size_t min_size(size_t a, size_t b) { - return a < b ? a : b; -} +size_t min_size(size_t a, size_t b) { return a < b ? 
a : b; } const char *state_name(RobotState s) { switch (s) { - case RobotState::Boot: return "boot"; - case RobotState::WifiConnecting: return "wifi_connecting"; - case RobotState::Idle: return "idle"; - case RobotState::WakeDetected: return "wake_detected"; - case RobotState::Listening: return "listening"; - case RobotState::Uploading: return "uploading"; - case RobotState::Thinking: return "thinking"; - case RobotState::Speaking: return "speaking"; - case RobotState::Muted: return "muted"; - case RobotState::Error: return "error"; + case RobotState::Boot: return "boot"; + case RobotState::WifiConnecting:return "wifi_connecting"; + case RobotState::Idle: return "idle"; + case RobotState::WakeDetected: return "wake_detected"; + case RobotState::Listening: return "listening"; + case RobotState::Uploading: return "uploading"; + case RobotState::Thinking: return "thinking"; + case RobotState::Speaking: return "speaking"; + case RobotState::Muted: return "muted"; + case RobotState::Error: return "error"; } return "unknown"; } @@ -74,14 +79,10 @@ void set_state(RobotState next) { Serial.println(state_name(state)); } -bool is_muted() { - return digitalRead(PIN_MUTE) == LOW; -} +bool is_muted() { return digitalRead(PIN_MUTE) == LOW; } void set_all(uint32_t color) { - for (uint16_t i = 0; i < PIXEL_COUNT; ++i) { - pixels.setPixelColor(i, color); - } + for (uint16_t i = 0; i < PIXEL_COUNT; ++i) pixels.setPixelColor(i, color); pixels.show(); } @@ -96,7 +97,8 @@ void animate_leds() { case RobotState::WifiConnecting: { pixels.clear(); - const uint16_t active = (now / 120) % PIXEL_COUNT; + const uint16_t active = JEWEL_RING_FIRST + (now / 120) % JEWEL_RING_COUNT; + pixels.setPixelColor(0, pixels.Color(0, 0, 8, 0)); pixels.setPixelColor(active, pixels.Color(0, 0, 48, 0)); pixels.show(); break; @@ -105,8 +107,7 @@ void animate_leds() { case RobotState::Idle: { const uint8_t phase = (now / 28) % 80; const uint8_t triangle = phase < 40 ? 
phase : 79 - phase; - const uint8_t white = 2 + triangle / 3; - set_all(pixels.Color(0, 0, 0, white)); + set_all(pixels.Color(0, 0, 0, 2 + triangle / 3)); break; } @@ -123,25 +124,21 @@ void animate_leds() { case RobotState::Uploading: { pixels.clear(); - const uint16_t active = (now / 80) % PIXEL_COUNT; - for (uint16_t i = 0; i < PIXEL_COUNT; ++i) { - const uint8_t level = i == active ? 52 : 5; - pixels.setPixelColor(i, pixels.Color(0, 0, level, 0)); - } + const uint16_t active = JEWEL_RING_FIRST + (now / 80) % JEWEL_RING_COUNT; + pixels.setPixelColor(0, pixels.Color(0, 0, 12, 0)); + for (uint16_t i = JEWEL_RING_FIRST; i < JEWEL_RING_FIRST + JEWEL_RING_COUNT; ++i) + pixels.setPixelColor(i, pixels.Color(0, 0, i == active ? 52 : 5, 0)); pixels.show(); break; } case RobotState::Thinking: { pixels.clear(); - const uint16_t active = (now / 100) % PIXEL_COUNT; - for (uint16_t i = 0; i < PIXEL_COUNT; ++i) { - if (i == active) { - pixels.setPixelColor(i, pixels.Color(45, 24, 0, 0)); - } else { - pixels.setPixelColor(i, pixels.Color(4, 2, 0, 0)); - } - } + const uint16_t active = JEWEL_RING_FIRST + (now / 100) % JEWEL_RING_COUNT; + pixels.setPixelColor(0, pixels.Color(18, 9, 0, 0)); + for (uint16_t i = JEWEL_RING_FIRST; i < JEWEL_RING_FIRST + JEWEL_RING_COUNT; ++i) + pixels.setPixelColor(i, i == active ? pixels.Color(45, 24, 0, 0) + : pixels.Color(4, 2, 0, 0)); pixels.show(); break; } @@ -159,65 +156,150 @@ void animate_leds() { case RobotState::Error: set_all((now / 180) % 2 == 0 ? pixels.Color(60, 0, 0, 0) - : pixels.Color(0, 0, 0, 0)); + : pixels.Color(0, 0, 0, 0)); break; } } -uint8_t *alloc_audio_buffer(size_t bytes) { - uint8_t *buffer = - static_cast(heap_caps_malloc(bytes, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT)); - if (!buffer) { - buffer = static_cast(malloc(bytes)); +// Runs once at power-on: ring pixels sequence clockwise, centre pops, full +// flash, then off (~650 ms total). Confirms the Jewel is wired and the +// sketch loaded before WiFi bring-up begins. 
+void play_startup_animation() { + pixels.clear(); + pixels.show(); + for (uint16_t i = 0; i < JEWEL_RING_COUNT; ++i) { + pixels.setPixelColor(JEWEL_RING_FIRST + i, pixels.Color(0, 0, 0, 180)); + pixels.show(); + delay(60); } - return buffer; + pixels.setPixelColor(0, pixels.Color(0, 0, 0, 220)); + pixels.show(); + delay(120); + set_all(pixels.Color(0, 0, 0, 255)); + delay(100); + pixels.clear(); + pixels.show(); + delay(50); } -void put_u16_le(uint8_t *p, uint16_t v) { - p[0] = static_cast(v & 0xff); - p[1] = static_cast((v >> 8) & 0xff); +// C major arpeggio with a G5 bounce before the final resolve — total ~800 ms. +// Square-wave synthesis; gap_ms of silence between notes adds articulation. +void play_startup_chime() { + if (!begin_audio_tx()) return; + + struct Note { uint16_t freq; uint16_t dur_ms; uint16_t gap_ms; }; + static const Note melody[] = { + {523, 75, 18}, // C5 + {659, 75, 18}, // E5 + {784, 75, 18}, // G5 + {1047, 130, 18}, // C6 — first hit + {784, 60, 12}, // G5 — bounce + {1047, 290, 0}, // C6 — resolve + }; + + static int16_t buf[256]; + const int16_t amp = 4000; + + for (size_t ni = 0; ni < sizeof(melody) / sizeof(melody[0]); ++ni) { + const uint32_t freq = melody[ni].freq; + const uint32_t total = (uint32_t)SAMPLE_RATE_HZ * melody[ni].dur_ms / 1000; + const uint32_t period = SAMPLE_RATE_HZ / freq; + uint32_t done = 0; + while (done < total) { + const size_t chunk = min_size(256, total - done); + for (size_t i = 0; i < chunk; ++i) + buf[i] = ((done + i) % period < period / 2) ? amp : -amp; + size_t written = 0; + i2s_write(I2S_NUM_0, buf, chunk * sizeof(int16_t), &written, portMAX_DELAY); + done += chunk; + } + // Articulation gap — silence between notes. 
+ if (melody[ni].gap_ms > 0) { + const uint32_t gap_samples = (uint32_t)SAMPLE_RATE_HZ * melody[ni].gap_ms / 1000; + uint32_t gap_done = 0; + while (gap_done < gap_samples) { + const size_t chunk = min_size(256, gap_samples - gap_done); + memset(buf, 0, chunk * sizeof(int16_t)); + size_t written = 0; + i2s_write(I2S_NUM_0, buf, chunk * sizeof(int16_t), &written, portMAX_DELAY); + gap_done += chunk; + } + } + } + + // Flush so the amp doesn't click when SD goes low. + memset(buf, 0, sizeof(buf)); + size_t written = 0; + i2s_write(I2S_NUM_0, buf, sizeof(buf), &written, portMAX_DELAY); + delay(20); + i2s_stop(); +} + +uint8_t *alloc_audio_buffer(size_t bytes) { + uint8_t *buf = static_cast( + heap_caps_malloc(bytes, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT)); + if (!buf) buf = static_cast(malloc(bytes)); + return buf; } +void put_u16_le(uint8_t *p, uint16_t v) { + p[0] = v & 0xff; p[1] = (v >> 8) & 0xff; +} void put_u32_le(uint8_t *p, uint32_t v) { - p[0] = static_cast(v & 0xff); - p[1] = static_cast((v >> 8) & 0xff); - p[2] = static_cast((v >> 16) & 0xff); - p[3] = static_cast((v >> 24) & 0xff); + p[0] = v & 0xff; p[1] = (v >> 8) & 0xff; + p[2] = (v >> 16) & 0xff; p[3] = (v >> 24) & 0xff; } void write_wav_header(uint8_t *wav, uint32_t pcm_bytes) { - memcpy(wav + 0, "RIFF", 4); - put_u32_le(wav + 4, 36 + pcm_bytes); - memcpy(wav + 8, "WAVE", 4); - memcpy(wav + 12, "fmt ", 4); - put_u32_le(wav + 16, 16); - put_u16_le(wav + 20, 1); - put_u16_le(wav + 22, 1); - put_u32_le(wav + 24, SAMPLE_RATE_HZ); + memcpy(wav + 0, "RIFF", 4); put_u32_le(wav + 4, 36 + pcm_bytes); + memcpy(wav + 8, "WAVE", 4); memcpy(wav + 12, "fmt ", 4); + put_u32_le(wav + 16, 16); put_u16_le(wav + 20, 1); + put_u16_le(wav + 22, 1); put_u32_le(wav + 24, SAMPLE_RATE_HZ); put_u32_le(wav + 28, SAMPLE_RATE_HZ * 2); - put_u16_le(wav + 32, 2); - put_u16_le(wav + 34, 16); - memcpy(wav + 36, "data", 4); - put_u32_le(wav + 40, pcm_bytes); + put_u16_le(wav + 32, 2); put_u16_le(wav + 34, 16); + memcpy(wav + 36, "data", 
4); put_u32_le(wav + 40, pcm_bytes); +} + +static void i2s_stop() { + i2s_driver_uninstall(I2S_NUM_0); } bool begin_audio_rx() { - Audio.end(); + i2s_stop(); digitalWrite(PIN_AMP_SD, LOW); delay(10); - Audio.setPins(PIN_I2S_BCLK, PIN_I2S_WS, -1, PIN_I2S_MIC); - if (!Audio.begin(I2S_MODE_STD, SAMPLE_RATE_HZ, I2S_DATA_BIT_WIDTH_32BIT, - I2S_SLOT_MODE_MONO, I2S_STD_SLOT_LEFT)) { - Serial.println("I2S RX init failed"); + // ICS-43434 sends 32-bit I2S frames; audio is in the top 18 bits (MSB-first). + // We read 32-bit and right-shift 16 to produce 16-bit PCM samples. + const i2s_config_t cfg = { + .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX), + .sample_rate = SAMPLE_RATE_HZ, + .bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT, + .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT, // SEL pin → GND + .communication_format = I2S_COMM_FORMAT_STAND_I2S, + .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1, + .dma_buf_count = 8, + .dma_buf_len = 128, + .use_apll = false, + .tx_desc_auto_clear = false, + .fixed_mclk = 0, + }; + const i2s_pin_config_t pins = { + .mck_io_num = I2S_PIN_NO_CHANGE, + .bck_io_num = GPIO_I2S_BCLK, + .ws_io_num = GPIO_I2S_WS, + .data_out_num = I2S_PIN_NO_CHANGE, + .data_in_num = GPIO_I2S_MIC, + }; + + if (i2s_driver_install(I2S_NUM_0, &cfg, 0, nullptr) != ESP_OK) { + Serial.println("I2S RX install failed"); audio_rx_ready = false; return false; } - - if (!Audio.configureRX(SAMPLE_RATE_HZ, I2S_DATA_BIT_WIDTH_32BIT, - I2S_SLOT_MODE_MONO, I2S_RX_TRANSFORM_32_TO_16, - I2S_STD_SLOT_LEFT)) { - Serial.println("I2S RX transform failed"); + if (i2s_set_pin(I2S_NUM_0, &pins) != ESP_OK) { + Serial.println("I2S RX pin config failed"); + i2s_stop(); audio_rx_ready = false; return false; } @@ -227,14 +309,38 @@ bool begin_audio_rx() { } bool begin_audio_tx() { - Audio.end(); + i2s_stop(); digitalWrite(PIN_AMP_SD, HIGH); delay(10); - Audio.setPins(PIN_I2S_BCLK, PIN_I2S_WS, PIN_I2S_AMP, -1); - if (!Audio.begin(I2S_MODE_STD, SAMPLE_RATE_HZ, I2S_DATA_BIT_WIDTH_16BIT, - 
I2S_SLOT_MODE_MONO, I2S_STD_SLOT_BOTH)) { - Serial.println("I2S TX init failed"); + const i2s_config_t cfg = { + .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX), + .sample_rate = SAMPLE_RATE_HZ, + .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT, + .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT, + .communication_format = I2S_COMM_FORMAT_STAND_I2S, + .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1, + .dma_buf_count = 8, + .dma_buf_len = 128, + .use_apll = false, + .tx_desc_auto_clear = true, // outputs silence when DMA buffer is empty + .fixed_mclk = 0, + }; + const i2s_pin_config_t pins = { + .mck_io_num = I2S_PIN_NO_CHANGE, + .bck_io_num = GPIO_I2S_BCLK, + .ws_io_num = GPIO_I2S_WS, + .data_out_num = GPIO_I2S_AMP, + .data_in_num = I2S_PIN_NO_CHANGE, + }; + + if (i2s_driver_install(I2S_NUM_0, &cfg, 0, nullptr) != ESP_OK) { + Serial.println("I2S TX install failed"); + return false; + } + if (i2s_set_pin(I2S_NUM_0, &pins) != ESP_OK) { + Serial.println("I2S TX pin config failed"); + i2s_stop(); return false; } @@ -259,8 +365,7 @@ bool connect_wifi() { } if (WiFi.status() == WL_CONNECTED) { - Serial.print("ip="); - Serial.println(WiFi.localIP()); + Serial.print("ip="); Serial.println(WiFi.localIP()); return true; } @@ -270,12 +375,9 @@ bool connect_wifi() { bool serial_wake_requested() { if (!THREEBO_ENABLE_SERIAL_WAKE) return false; - while (Serial.available() > 0) { const char c = static_cast(Serial.read()); - if (c == 'w' || c == 'W') { - return true; - } + if (c == 'w' || c == 'W') return true; } return false; } @@ -284,37 +386,30 @@ bool energy_wake_detected() { if (!THREEBO_ENABLE_ENERGY_WAKE || !audio_rx_ready) return false; if (millis() - last_energy_wake_ms < 2500) return false; - int16_t samples[128]; - const size_t wanted = sizeof(samples); - const size_t got = Audio.readBytes(reinterpret_cast(samples), wanted); - if (got < wanted) return false; + // Read 128 × 32-bit frames, shift to 16-bit for energy estimate. 
+ int32_t raw[128]; + size_t bytes_read = 0; + i2s_read(I2S_NUM_0, raw, sizeof(raw), &bytes_read, 0); + const size_t n = bytes_read / sizeof(int32_t); + if (n == 0) return false; int64_t sum = 0; - const size_t sample_count = got / sizeof(int16_t); - for (size_t i = 0; i < sample_count; ++i) { - const int32_t sample = samples[i]; - sum += sample < 0 ? -sample : sample; + for (size_t i = 0; i < n; ++i) { + const int32_t s = raw[i] >> 16; + sum += s < 0 ? -s : s; } - const int32_t avg = static_cast(sum / sample_count); + const int32_t avg = static_cast(sum / static_cast(n)); if (avg > THREEBO_ENERGY_WAKE_THRESHOLD) { last_energy_wake_ms = millis(); - Serial.print("energy_wake avg="); - Serial.println(avg); + Serial.print("energy_wake avg="); Serial.println(avg); return true; } - return false; } -bool wake_detected() { - return serial_wake_requested() || energy_wake_detected(); -} +bool wake_detected() { return serial_wake_requested() || energy_wake_detected(); } -// POST a JSON event to the bridge POST /v1/event endpoint. -// Fire-and-forget: the bridge relays it to Arbiter POST /v1/events. -// No-op when THREEBO_ENABLE_EVENTS is false or Wi-Fi is not connected. -// data_json must be a valid JSON object literal (e.g. "{}"). 
void send_event(const char *type, const char *data_json = "{}") { if (!THREEBO_ENABLE_EVENTS) return; if (WiFi.status() != WL_CONNECTED) return; @@ -322,22 +417,16 @@ void send_event(const char *type, const char *data_json = "{}") { WiFiClient client; HTTPClient http; const String url = String(THREEBO_BRIDGE_BASE_URL) + "/v1/event"; - if (!http.begin(client, url)) return; http.setTimeout(THREEBO_EVENT_TIMEOUT_MS); http.addHeader("Content-Type", "application/json"); http.addHeader("Authorization", String("Bearer ") + THREEBO_DEVICE_SECRET); String body = "{\"type\":\""; - body += type; - body += "\",\"data\":"; - body += data_json; - body += "}"; - + body += type; body += "\",\"data\":"; body += data_json; body += "}"; http.POST(body); http.end(); - Serial.print("event="); - Serial.println(type); + Serial.print("event="); Serial.println(type); } uint8_t *record_utterance_wav(size_t *out_len) { @@ -347,61 +436,57 @@ uint8_t *record_utterance_wav(size_t *out_len) { const uint32_t seconds = THREEBO_RECORD_SECONDS > 0 ? THREEBO_RECORD_SECONDS : 1; const size_t max_pcm_bytes = seconds * SAMPLE_RATE_HZ * sizeof(int16_t); uint8_t *wav = alloc_audio_buffer(WAV_HEADER_BYTES + max_pcm_bytes); - if (!wav) { - Serial.println("audio allocation failed"); - return nullptr; - } + if (!wav) { Serial.println("audio allocation failed"); return nullptr; } write_wav_header(wav, 0); size_t written = 0; - uint8_t *pcm = wav + WAV_HEADER_BYTES; + int16_t *pcm16 = reinterpret_cast(wav + WAV_HEADER_BYTES); const uint32_t started = millis(); + // Temporary 32-bit read buffer: ICS-43434 sends 32-bit frames. + // Right-shift 16 to extract the top 16 bits as the 16-bit PCM sample. 
+ static int32_t tmp32[AUDIO_CHUNK_BYTES / sizeof(int32_t)]; + while (written < max_pcm_bytes && !is_muted()) { const size_t remaining = max_pcm_bytes - written; - const size_t chunk = min_size(AUDIO_CHUNK_BYTES, remaining); - const size_t got = Audio.readBytes(reinterpret_cast(pcm + written), chunk); - - if (got > 0) { - written += got; - } else { - delay(1); + const size_t chunk_samples = min_size( + sizeof(tmp32) / sizeof(int32_t), + remaining / sizeof(int16_t)); + + size_t bytes_read = 0; + i2s_read(I2S_NUM_0, tmp32, chunk_samples * sizeof(int32_t), + &bytes_read, pdMS_TO_TICKS(10)); + + if (bytes_read > 0) { + const size_t n = bytes_read / sizeof(int32_t); + for (size_t i = 0; i < n; ++i) + pcm16[written / sizeof(int16_t) + i] = static_cast(tmp32[i] >> 16); + written += n * sizeof(int16_t); } animate_leds(); - if (millis() - started > (seconds * 1000UL + 500UL)) { - break; - } + if (millis() - started > seconds * 1000UL + 500UL) break; } write_wav_header(wav, written); *out_len = WAV_HEADER_BYTES + written; - Serial.print("recorded_wav_bytes="); - Serial.println(*out_len); + Serial.print("recorded_wav_bytes="); Serial.println(*out_len); return wav; } bool read_response_body(HTTPClient &http, uint8_t **out, size_t *out_len) { - *out = nullptr; - *out_len = 0; + *out = nullptr; *out_len = 0; const int length = http.getSize(); - if (length <= 0) { - Serial.println("bridge response needs Content-Length"); - return false; - } + if (length <= 0) { Serial.println("bridge response needs Content-Length"); return false; } if (static_cast(length) > THREEBO_MAX_RESPONSE_WAV_BYTES) { - Serial.println("bridge response too large"); - return false; + Serial.println("bridge response too large"); return false; } uint8_t *body = alloc_audio_buffer(static_cast(length)); - if (!body) { - Serial.println("response allocation failed"); - return false; - } + if (!body) { Serial.println("response allocation failed"); return false; } WiFiClient *stream = http.getStreamPtr(); size_t 
read_total = 0; @@ -409,26 +494,23 @@ bool read_response_body(HTTPClient &http, uint8_t **out, size_t *out_len) { while (read_total < static_cast(length) && millis() - started < THREEBO_HTTP_TIMEOUT_MS) { + animate_leds(); const int available = stream->available(); if (available > 0) { - const size_t chunk = - min_size(static_cast(available), static_cast(length) - read_total); - const size_t got = stream->readBytes(reinterpret_cast(body + read_total), chunk); - read_total += got; + const size_t chunk = min_size(static_cast(available), + static_cast(length) - read_total); + read_total += stream->readBytes( + reinterpret_cast(body + read_total), chunk); } else { - animate_leds(); delay(5); } } if (read_total != static_cast(length)) { - free(body); - Serial.println("bridge response read timed out"); - return false; + free(body); Serial.println("bridge response read timed out"); return false; } - *out = body; - *out_len = read_total; + *out = body; *out_len = read_total; return true; } @@ -442,6 +524,7 @@ bool upload_utterance_and_play_response(uint8_t *wav, size_t wav_len) { if (!http.begin(client, url)) { Serial.println("HTTP begin failed"); + free(wav); return false; } @@ -457,8 +540,7 @@ bool upload_utterance_and_play_response(uint8_t *wav, size_t wav_len) { wav = nullptr; if (status != HTTP_CODE_OK) { - Serial.print("bridge status="); - Serial.println(status); + Serial.print("bridge status="); Serial.println(status); http.end(); return false; } @@ -468,17 +550,20 @@ bool upload_utterance_and_play_response(uint8_t *wav, size_t wav_len) { size_t response_len = 0; const bool read_ok = read_response_body(http, &response, &response_len); http.end(); - if (!read_ok) return false; set_state(RobotState::Speaking); - if (!begin_audio_tx()) { - free(response); - return false; + if (!begin_audio_tx()) { free(response); return false; } + + // Write PCM payload directly; skip the 44-byte WAV header. 
+ if (response_len > WAV_HEADER_BYTES) { + const uint8_t *pcm = response + WAV_HEADER_BYTES; + const size_t pcm_len = response_len - WAV_HEADER_BYTES; + size_t written = 0; + i2s_write(I2S_NUM_0, pcm, pcm_len, &written, portMAX_DELAY); } - Audio.playWAV(response, response_len); - Audio.end(); + i2s_stop(); free(response); return true; } @@ -486,10 +571,7 @@ bool upload_utterance_and_play_response(uint8_t *wav, size_t wav_len) { void handle_turn() { set_state(RobotState::WakeDetected); const uint32_t flash_started = millis(); - while (millis() - flash_started < 250) { - animate_leds(); - delay(10); - } + while (millis() - flash_started < 250) { animate_leds(); delay(10); } if (is_muted()) return; @@ -499,7 +581,7 @@ void handle_turn() { size_t wav_len = 0; uint8_t *wav = record_utterance_wav(&wav_len); - Audio.end(); + i2s_stop(); audio_rx_ready = false; if (!wav || wav_len <= WAV_HEADER_BYTES || is_muted()) { @@ -514,10 +596,7 @@ void handle_turn() { send_event("device.error", "{\"source\":\"bridge\"}"); set_state(RobotState::Error); const uint32_t error_started = millis(); - while (millis() - error_started < ERROR_HOLD_MS) { - animate_leds(); - delay(20); - } + while (millis() - error_started < ERROR_HOLD_MS) { animate_leds(); delay(20); } } begin_audio_rx(); @@ -529,6 +608,7 @@ void setup() { delay(300); pinMode(PIN_MUTE, INPUT_PULLUP); + prev_muted_state = is_muted(); pinMode(PIN_AMP_SD, OUTPUT); digitalWrite(PIN_AMP_SD, LOW); @@ -537,6 +617,9 @@ void setup() { pixels.clear(); pixels.show(); + play_startup_animation(); + play_startup_chime(); + set_state(RobotState::Boot); animate_leds(); @@ -554,12 +637,9 @@ void setup() { void loop() { animate_leds(); - if (!connect_wifi()) { - delay(25); - return; - } + if (!connect_wifi()) { delay(25); return; } - if (state == RobotState::WifiConnecting) { + if (boot_event_pending || state == RobotState::WifiConnecting) { const String ip = WiFi.localIP().toString(); if (boot_event_pending) { boot_event_pending = false; @@ 
-582,19 +662,10 @@ void loop() { send_event(cur_muted ? "device.mute" : "device.unmute", "{}"); } - if (cur_muted) { - set_state(RobotState::Muted); - delay(25); - return; - } - - if (state == RobotState::Muted) { - set_state(RobotState::Idle); - } + if (cur_muted) { set_state(RobotState::Muted); delay(25); return; } + if (state == RobotState::Muted) set_state(RobotState::Idle); - if (state == RobotState::Idle && wake_detected()) { - handle_turn(); - } + if (state == RobotState::Idle && wake_detected()) handle_turn(); delay(5); }