diff --git a/kos-py/pykos/client.py b/kos-py/pykos/client.py index 2c0a8f4..5bd0500 100644 --- a/kos-py/pykos/client.py +++ b/kos-py/pykos/client.py @@ -12,6 +12,7 @@ from pykos.services.process_manager import ProcessManagerServiceClient from pykos.services.sim import SimServiceClient from pykos.services.sound import SoundServiceClient +from pykos.services.speech import SpeechServiceClient class KOS: @@ -23,6 +24,13 @@ class KOS: Attributes: imu (IMUServiceClient): Client for the IMU service. + actuator (ActuatorServiceClient): Client for the actuator service. + led_matrix (LEDMatrixServiceClient): Client for the LED matrix service. + sound (SoundServiceClient): Client for the sound service. + process_manager (ProcessManagerServiceClient): Client for the process manager service. + inference (InferenceServiceClient): Client for the inference service. + sim (SimServiceClient): Client for the simulation service. + speech (SpeechServiceClient): Client for the speech service. """ def __init__(self, ip: str = "localhost", port: int = 50051) -> None: @@ -36,6 +44,7 @@ def __init__(self, ip: str = "localhost", port: int = 50051) -> None: self._process_manager: ProcessManagerServiceClient | None = None self._inference: InferenceServiceClient | None = None self._sim: SimServiceClient | None = None + self._speech: SpeechServiceClient | None = None @property def imu(self) -> IMUServiceClient: @@ -79,14 +88,21 @@ def sim(self) -> SimServiceClient: raise RuntimeError("Sim client not initialized! Must call __aenter__() first.") return self._sim + @property + def speech(self) -> SpeechServiceClient: + if self._speech is None: + raise RuntimeError("Speech client not initialized! 
Must call __aenter__() first.") + return self._speech + async def connect(self) -> None: """Connect to the gRPC server and initialize service clients.""" self._channel = grpc.aio.insecure_channel(f"{self.ip}:{self.port}") + self._process_manager = ProcessManagerServiceClient(self._channel) self._imu = IMUServiceClient(self._channel) self._actuator = ActuatorServiceClient(self._channel) self._led_matrix = LEDMatrixServiceClient(self._channel) self._sound = SoundServiceClient(self._channel) - self._process_manager = ProcessManagerServiceClient(self._channel) + self._speech = SpeechServiceClient(self._channel) self._inference = InferenceServiceClient(self._channel) self._sim = SimServiceClient(self._channel) diff --git a/kos-py/pykos/services/speech.py b/kos-py/pykos/services/speech.py new file mode 100644 index 0000000..bb0d139 --- /dev/null +++ b/kos-py/pykos/services/speech.py @@ -0,0 +1,61 @@ +"""Speech service client.""" + +import grpc +import grpc.aio + +from kos_protos import speech_pb2, speech_pb2_grpc + + +class SpeechServiceClient: + """Client for the SpeechService. + + This service provides text-to-speech synthesis and speech-to-text transcription. + """ + + def __init__(self, channel: grpc.aio.Channel) -> None: + """Initialize the speech service client. + + Args: + channel: gRPC channel to use for communication. + """ + self.stub = speech_pb2_grpc.SpeechServiceStub(channel) + + async def synthesize(self, text: str) -> str: + """Synthesize speech from text. + + Args: + text: Text to synthesize. + + Returns: + Path of the output file containing the synthesized speech. + + Raises: + RuntimeError: If the service response has its error field set. + """ + request = speech_pb2.SynthesizeRequest(text=text) + + response = await self.stub.Synthesize(request) + if response.HasField("error"): + raise RuntimeError(f"Synthesis error: {response.error}") + return response.file_path + + async def transcribe(self, audio_data: str) -> str: + """Transcribe speech to text. 
+ + Args: + audio_data: Audio data to transcribe + + Returns: + Transcribed text. + + Raises: + RuntimeError: If transcription fails. + """ + request = speech_pb2.TranscribeRequest( + audio_data=audio_data, + ) + + response = await self.stub.Transcribe(request) + if response.HasField("error"): + raise RuntimeError(f"Transcription error: {response.error}") + return response.text diff --git a/kos-py/tests/test_pykos.py b/kos-py/tests/test_pykos.py deleted file mode 100644 index 0c66bef..0000000 --- a/kos-py/tests/test_pykos.py +++ /dev/null @@ -1,52 +0,0 @@ -"""Defines a dummy test.""" - -import grpc -import pytest - -import pykos - - -def test_dummy() -> None: - assert True - - -@pytest.mark.asyncio -async def test_pykos() -> None: - # In order to test this client, you should run the stub KOS server. - # This can be done from the parent directory with `cargo run --bin kos-stub` - if not _is_server_running("127.0.0.1:50051"): - pytest.skip("No active gRPC server at 127.0.0.1:50051") - - # Tests configuring the actuator. - async with pykos.KOS("127.0.0.1") as client: - actuator_response = await client.actuator.configure_actuator(actuator_id=1) - assert actuator_response.success - - # Tests getting the actuator state. - actuator_state = await client.actuator.get_actuators_state(actuator_ids=[1]) - assert actuator_state.states[0].actuator_id == 1 - - # Tests the IMU endpoints. - imu_response = await client.imu.get_imu_values() - assert imu_response.accel_x is not None - await client.imu.get_imu_advanced_values() - await client.imu.get_euler_angles() - await client.imu.get_quaternion() - await client.imu.calibrate() - zero_response = await client.imu.zero(duration=1.0, max_retries=1, max_angular_error=1.0) - assert zero_response.success - - # Tests the K-Clip endpoints. 
- start_kclip_response = await client.process_manager.start_kclip(action="start") - assert start_kclip_response.clip_uuid is not None - stop_kclip_response = await client.process_manager.stop_kclip() - assert stop_kclip_response.clip_uuid is not None - - -def _is_server_running(address: str) -> bool: - try: - channel = grpc.insecure_channel(address) - grpc.channel_ready_future(channel).result(timeout=1) - return True - except grpc.FutureTimeoutError: - return False diff --git a/kos-stub/src/lib.rs b/kos-stub/src/lib.rs index 8cba942..8874a04 100644 --- a/kos-stub/src/lib.rs +++ b/kos-stub/src/lib.rs @@ -1,15 +1,21 @@ mod actuator; mod imu; mod process_manager; +mod speech; use crate::actuator::StubActuator; use crate::imu::StubIMU; use crate::process_manager::StubProcessManager; +use crate::speech::StubSpeech; use async_trait::async_trait; use kos::hal::Operation; use kos::kos_proto::actuator::actuator_service_server::ActuatorServiceServer; use kos::kos_proto::imu::imu_service_server::ImuServiceServer; use kos::kos_proto::process_manager::process_manager_service_server::ProcessManagerServiceServer; -use kos::services::{ActuatorServiceImpl, IMUServiceImpl, ProcessManagerServiceImpl}; +use kos::kos_proto::speech::speech_service_server::SpeechServiceServer; +use kos::services::{ + ActuatorServiceImpl, IMUServiceImpl, ProcessManagerServiceImpl, SpeechServiceImpl, +}; + use kos::{services::OperationsServiceImpl, Platform, ServiceEnum}; use std::future::Future; use std::pin::Pin; @@ -52,6 +58,7 @@ impl Platform for StubPlatform { let actuator = StubActuator::new(operations_service.clone()); let imu = StubIMU::new(operations_service.clone()); let process_manager = StubProcessManager::new(); + let speech = StubSpeech::new(); Ok(vec![ ServiceEnum::Actuator(ActuatorServiceServer::new(ActuatorServiceImpl::new( @@ -61,6 +68,9 @@ impl Platform for StubPlatform { ProcessManagerServiceImpl::new(Arc::new(process_manager)), )), 
ServiceEnum::Imu(ImuServiceServer::new(IMUServiceImpl::new(Arc::new(imu)))), + ServiceEnum::Speech(SpeechServiceServer::new(SpeechServiceImpl::new(Arc::new( + speech, + )))), ]) }) } diff --git a/kos-stub/src/speech.rs b/kos-stub/src/speech.rs new file mode 100644 index 0000000..000f75e --- /dev/null +++ b/kos-stub/src/speech.rs @@ -0,0 +1,32 @@ +use async_trait::async_trait; +use eyre::Result; +use kos::hal::Speech; +use kos::kos_proto::speech::SynthesizeResponse; +use std::process::Command; +use uuid::Uuid; +pub struct StubSpeech {} + +impl Default for StubSpeech { + fn default() -> Self { + Self::new() + } +} + +impl StubSpeech { + pub fn new() -> Self { + StubSpeech {} + } +} + +#[async_trait] +impl Speech for StubSpeech { + async fn synthesize(&self, text: String) -> Result { + // Generate a unique filename for the wav output + let output_file = format!("synthesize_{}.wav", Uuid::new_v4()); + + Ok(SynthesizeResponse { + file_path: output_file, + error: None, + }) + } +} diff --git a/kos/build.rs b/kos/build.rs index cc4a475..f09f9ea 100644 --- a/kos/build.rs +++ b/kos/build.rs @@ -6,7 +6,10 @@ fn main() { let proto_root = "proto"; // Where to output the compiled Rust files - let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); + let out_dir = PathBuf::from(match env::var("OUT_DIR") { + Ok(dir) => dir, + Err(e) => panic!("Failed to get OUT_DIR: {}", e), + }); // List of Protobuf files let protos = [ @@ -19,6 +22,7 @@ fn main() { "kos/system.proto", "kos/led_matrix.proto", "kos/sound.proto", + "kos/speech.proto", "google/longrunning/operations.proto", ]; diff --git a/kos/proto/kos/speech.proto b/kos/proto/kos/speech.proto new file mode 100644 index 0000000..afe0a1c --- /dev/null +++ b/kos/proto/kos/speech.proto @@ -0,0 +1,44 @@ +syntax = "proto3"; + +package kos.speech; + +import "google/protobuf/empty.proto"; +import "kos/common.proto"; + +option go_package = "kos/speech;speech"; +option java_package = "com.kos.speech"; +option csharp_namespace = 
"KOS.Speech"; + +// The SpeechService provides methods to transcribe or synthesize speech +service SpeechService { + // Transcribes speech to text + rpc Transcribe(TranscribeRequest) returns (TranscribeResponse); + + // Synthesizes speech from text + rpc Synthesize(SynthesizeRequest) returns (SynthesizeResponse); +} + +enum Model { + ESPEAK_NG = 0; + KMODEL = 1; + LOCAL = 2; +} + +message TranscribeRequest { + string audio_data = 1; +} + +message TranscribeResponse { + string text = 1; + kos.common.Error error = 2; +} + +message SynthesizeRequest { + string text = 1; + Model model = 2; +} + +message SynthesizeResponse { + string file_path = 1; + kos.common.Error error = 2; +} diff --git a/kos/src/daemon.rs b/kos/src/daemon.rs index a69ba83..34bafa6 100644 --- a/kos/src/daemon.rs +++ b/kos/src/daemon.rs @@ -40,6 +40,7 @@ fn add_service_to_router( ServiceEnum::Inference(svc) => router.add_service(svc), ServiceEnum::LEDMatrix(svc) => router.add_service(svc), ServiceEnum::Sound(svc) => router.add_service(svc), + ServiceEnum::Speech(svc) => router.add_service(svc), } } diff --git a/kos/src/grpc_interface.rs b/kos/src/grpc_interface.rs index 8c21486..b7ec1a4 100644 --- a/kos/src/grpc_interface.rs +++ b/kos/src/grpc_interface.rs @@ -30,6 +30,10 @@ pub mod kos { pub mod sound { tonic::include_proto!("kos/kos.sound"); } + + pub mod speech { + tonic::include_proto!("kos/kos.speech"); + } } pub mod google { diff --git a/kos/src/hal.rs b/kos/src/hal.rs index 77f9e1e..213a732 100644 --- a/kos/src/hal.rs +++ b/kos/src/hal.rs @@ -3,7 +3,7 @@ pub use crate::grpc_interface::kos; pub use crate::grpc_interface::kos::common::ActionResponse; pub use crate::kos_proto::{ actuator::*, common::ActionResult, imu::*, inference::*, led_matrix::*, process_manager::*, - sound::*, + sound::*, speech::*, }; use async_trait::async_trait; use bytes::Bytes; @@ -44,12 +44,6 @@ pub trait IMU: Send + Sync { async fn get_quaternion(&self) -> Result; } -#[async_trait] -pub trait ProcessManager: Send + 
Sync { - async fn start_kclip(&self, action: String) -> Result; - async fn stop_kclip(&self) -> Result; -} - #[async_trait] pub trait Inference: Send + Sync { async fn upload_model( @@ -107,6 +101,17 @@ pub trait Sound: Send + Sync { async fn stop_recording(&self) -> Result; } +#[async_trait] +pub trait ProcessManager: Send + Sync { + async fn start_kclip(&self, action: String) -> Result; + async fn stop_kclip(&self) -> Result; +} + +#[async_trait] +pub trait Speech: Send + Sync { + async fn synthesize(&self, text: String) -> Result; +} + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum CalibrationStatus { Calibrating, diff --git a/kos/src/lib.rs b/kos/src/lib.rs index b4da246..8453843 100644 --- a/kos/src/lib.rs +++ b/kos/src/lib.rs @@ -20,10 +20,11 @@ use hal::inference_service_server::InferenceServiceServer; use hal::led_matrix_service_server::LedMatrixServiceServer; use hal::process_manager_service_server::ProcessManagerServiceServer; use hal::sound_service_server::SoundServiceServer; +use hal::speech_service_server::SpeechServiceServer; use services::OperationsServiceImpl; use services::{ ActuatorServiceImpl, IMUServiceImpl, InferenceServiceImpl, LEDMatrixServiceImpl, - ProcessManagerServiceImpl, SoundServiceImpl, + ProcessManagerServiceImpl, SoundServiceImpl, SpeechServiceImpl, }; use std::fmt::Debug; use std::future::Future; @@ -66,6 +67,12 @@ impl Debug for SoundServiceImpl { } } +impl Debug for SpeechServiceImpl { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "SpeechServiceImpl") + } +} + #[derive(Debug)] pub enum ServiceEnum { Actuator(ActuatorServiceServer), @@ -74,6 +81,7 @@ pub enum ServiceEnum { Inference(InferenceServiceServer), LEDMatrix(LedMatrixServiceServer), Sound(SoundServiceServer), + Speech(SpeechServiceServer), } #[async_trait] diff --git a/kos/src/services/mod.rs b/kos/src/services/mod.rs index 075c804..33a81e9 100644 --- a/kos/src/services/mod.rs +++ b/kos/src/services/mod.rs @@ -6,6 +6,7 @@ mod 
led_matrix; mod operations; mod process_manager; mod sound; +mod speech; pub use actuator::*; pub use imu::*; @@ -15,3 +16,4 @@ pub use led_matrix::*; pub use operations::*; pub use process_manager::*; pub use sound::*; +pub use speech::*; diff --git a/kos/src/services/speech.rs b/kos/src/services/speech.rs new file mode 100644 index 0000000..9efaaf8 --- /dev/null +++ b/kos/src/services/speech.rs @@ -0,0 +1,42 @@ +use crate::hal::Speech; +use crate::kos_proto::speech::speech_service_server::SpeechService; +use crate::kos_proto::speech::*; +use std::sync::Arc; +use tonic::{Request, Response, Status}; +use tracing::trace; + +pub struct SpeechServiceImpl { + speech: Arc, +} + +impl SpeechServiceImpl { + pub fn new(speech: Arc) -> Self { + Self { speech } + } +} + +#[tonic::async_trait] +impl SpeechService for SpeechServiceImpl { + async fn synthesize( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + trace!("Synthesizing text: {}", req.text); + + let response = self + .speech + .synthesize(req.text) + .await + .map_err(|e| Status::internal(format!("Failed to synthesize text, {:?}", e)))?; + + Ok(Response::new(response)) + } + + async fn transcribe( + &self, + request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Transcribe not implemented")) + } +}