diff --git a/README.md b/README.md index 308d490..4cbf3d8 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Welcome to the Voice Assistant project! 🎙️ Our goal is to create a modular ## Features 🧰 - **Modular Design**: Easily switch between different models for transcription, response generation, and TTS. -- **Support for Multiple APIs**: Integrates with OpenAI, Groq, and Deepgram APIs, along with placeholders for local models. +- **Support for Multiple APIs**: Integrates with OpenAI, Groq, Deepgram, ElevenLabs, and 60db.ai APIs, along with placeholders for local models. - **Audio Recording and Playback**: Record audio from the microphone and play generated speech. - **Configuration Management**: Centralized configuration in `config.py` for easy setup and management. @@ -84,6 +84,9 @@ Create a `.env` file in the root directory and add your API keys: OPENAI_API_KEY=your_openai_api_key GROQ_API_KEY=your_groq_api_key DEEPGRAM_API_KEY=your_deepgram_api_key + ELEVENLABS_API_KEY=your_elevenlabs_api_key + SIXTYDB_API_KEY=your_60db_api_key + SIXTYDB_VOICE_ID=fbb75ed2-975a-40c7-9e06-38e30524a9a1 LOCAL_MODEL_PATH=path/to/local/model PIPER_SERVER_URL=server_url ``` @@ -96,12 +99,14 @@ Edit config.py to select the models you want to use: # Model selection TRANSCRIPTION_MODEL = 'groq' # Options: 'openai', 'groq', 'deepgram', 'fastwhisperapi' 'local' RESPONSE_MODEL = 'groq' # Options: 'openai', 'groq', 'ollama', 'local' - TTS_MODEL = 'deepgram' # Options: 'openai', 'deepgram', 'elevenlabs', 'local', 'melotts', 'piper' + TTS_MODEL = 'deepgram' # Options: 'openai', 'deepgram', 'elevenlabs', 'sixtydb', 'local', 'melotts', 'piper' # API keys and paths OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") GROQ_API_KEY = os.getenv("GROQ_API_KEY") DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY") + ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY") + SIXTYDB_API_KEY = os.getenv("SIXTYDB_API_KEY") LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH") ``` @@ -112,12 +117,19 @@ If you are 
running LLM locally via [Ollama](https://ollama.com/), make sure the - Follow this [link](https://elevenlabs.io/app/voice-lab/share/de3746fa51a09e771604d74b5d1ff6797b6b96a5958f9de95cef544dde31dad9/WArWzu0z4mbSyy5BfRKM) to add the Jarvis voice to your ElevenLabs account. - Name the voice 'Paul J.' or, if you prefer a different name, ensure it matches the ELEVENLABS_VOICE_ID variable in the text_to_speech.py file. -7. 🏃 **Run the voice assistant** +7. 🔊 **Configure 60db.ai TTS** +- Sign up at [60db.ai](https://60db.ai) and get your API key from the dashboard. +- Add `SIXTYDB_API_KEY` to your `.env` file. +- Optionally, set `SIXTYDB_VOICE_ID` in your `.env` to use a custom voice (if unset, it falls back to the voice ID `fbb75ed2-975a-40c7-9e06-38e30524a9a1` hard-coded in `config.py`). +- You can browse your available voices via the [Voices API](https://docs.60db.ai/api-reference/voices/get-my-voices). +- Set `TTS_MODEL = 'sixtydb'` in `config.py` to use 60db.ai. + +8. 🏃 **Run the voice assistant** ```shell python run_voice_assistant.py ``` -8. 🎤 **Install FastWhisperAPI** +9. 🎤 **Install FastWhisperAPI** _Optional step if you need a local transcription model_ @@ -151,7 +163,7 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the ``` Refer to the repository documentation for the Google Colab method: https://github.com/3choff/FastWhisperAPI/blob/main/README.md -8. 🎤 **Install Local TTS - MeloTTS** +10. 🎤 **Install Local TTS - MeloTTS** _Optional step if you need a local Text to Speech model_ @@ -167,10 +179,10 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the In order to use the local TTS model, you will need to update the `config.py` file by setting: ```shell - TTS_MODEL = 'melotts' # Options: 'openai', 'deepgram', 'elevenlabs', 'local', 'melotts', 'piper' + TTS_MODEL = 'melotts' # Options: 'openai', 'deepgram', 'elevenlabs', 'sixtydb', 'local', 'melotts', 'piper' ``` -9. 🎤 **Install Local TTS - Piper** +11. 
🎤 **Install Local TTS - Piper** _A faster and lightweight alternative to MeloTTS_ @@ -206,7 +218,7 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the In order to use the local TTS model, you will need to update the `config.py` file by setting: ```shell - TTS_MODEL = 'piper' # Options: 'openai', 'deepgram', 'elevenlabs', 'local', 'melotts','piper' + TTS_MODEL = 'piper' # Options: 'openai', 'deepgram', 'elevenlabs', 'sixtydb', 'local', 'melotts','piper' ``` You can run the main file to start using verbi with local models. @@ -232,6 +244,10 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the - **OpenAI**: Uses OpenAI's TTS model with the 'fable' voice. - **Deepgram**: Uses Deepgram's TTS model with the 'aura-angus-en' voice. - **ElevenLabs**: Uses ElevenLabs' TTS model with the 'Paul J.' voice. +- **60db.ai**: Uses 60db.ai's TTS API with the default voice (configurable via `SIXTYDB_VOICE_ID`). Supports multiple languages including English and Indic languages (Hindi, Bengali, Tamil, etc.). +- **Cartesia**: Uses Cartesia's Sonic English model with real-time streaming playback. +- **MeloTTS**: Uses the local MeloTTS model (requires local setup). +- **Piper**: Uses the local Piper model — a fast and lightweight alternative (requires local setup). - **Local**: Placeholder for a local TTS model. ## Detailed Module Descriptions 📘 @@ -253,7 +269,7 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the Here's what's next for the Voice Assistant project: 1. **Add Support for Streaming**: Enable real-time streaming of audio input and output. -2. **Add Support for ElevenLabs and Enhanced Deepgram for TTS**: Integrate additional TTS options for higher quality and variety. +2. ~~**Add Support for ElevenLabs and Enhanced Deepgram for TTS**~~: Integrated ElevenLabs, 60db.ai, and Cartesia TTS options. ✅ 3. 
**Add Filler Audios**: Include background or filler audios while waiting for model responses to enhance user experience. 4. **Add Support for Local Models Across the Board**: Expand support for local models in transcription, response generation, and TTS. diff --git a/example.env b/example.env index 05527b1..b26742a 100644 --- a/example.env +++ b/example.env @@ -2,6 +2,8 @@ OPENAI_API_KEY="OPENAI_API_KEY" GROQ_API_KEY="GROQ_API_KEY" DEEPGRAM_API_KEY="DEEPGRAM_API_KEY" ELEVENLABS_API_KEY="ELEVENLABS_API_KEY" +SIXTYDB_API_KEY="SIXTYDB_API_KEY" +SIXTYDB_VOICE_ID="fbb75ed2-975a-40c7-9e06-38e30524a9a1" CARTESIA_API_KEY="CARTESIA_API_KEY" LOCAL_MODEL_PATH=path/to/local/model PIPER_SERVER_URL=http://localhost:5000 diff --git a/run_voice_assistant.py b/run_voice_assistant.py index 418e1c0..7ad7fbf 100644 --- a/run_voice_assistant.py +++ b/run_voice_assistant.py @@ -65,7 +65,7 @@ def main(): chat_history.append({"role": "assistant", "content": response_text}) # Determine the output file format based on the TTS model - if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia': + if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'sixtydb' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia': output_file = 'output.mp3' else: output_file = 'output.wav' diff --git a/voice_assistant/api_key_manager.py b/voice_assistant/api_key_manager.py index 68668e3..b9e8e5a 100644 --- a/voice_assistant/api_key_manager.py +++ b/voice_assistant/api_key_manager.py @@ -15,7 +15,8 @@ "tts": { "openai": Config.OPENAI_API_KEY, "deepgram":Config.DEEPGRAM_API_KEY, - "elevenlabs": Config.ELEVENLABS_API_KEY + "elevenlabs": Config.ELEVENLABS_API_KEY, + "sixtydb": Config.SIXTYDB_API_KEY } } diff --git a/voice_assistant/config.py b/voice_assistant/config.py index 73fbc87..18fe859 100644 --- a/voice_assistant/config.py +++ b/voice_assistant/config.py @@ -13,17 +13,19 @@ class 
Config: Attributes: TRANSCRIPTION_MODEL (str): The model to use for transcription ('openai', 'groq', 'deepgram', 'fastwhisperapi', 'local'). RESPONSE_MODEL (str): The model to use for response generation ('openai', 'groq', 'local'). - TTS_MODEL (str): The model to use for text-to-speech ('openai', 'deepgram', 'elevenlabs', 'local'). + TTS_MODEL (str): The model to use for text-to-speech ('openai', 'deepgram', 'elevenlabs', 'sixtydb', 'melotts', 'cartesia', 'piper', 'local'). OPENAI_API_KEY (str): API key for OpenAI services. GROQ_API_KEY (str): API key for Groq services. DEEPGRAM_API_KEY (str): API key for Deepgram services. ELEVENLABS_API_KEY (str): API key for ElevenLabs services. + SIXTYDB_API_KEY (str): API key for 60db.ai services. + SIXTYDB_VOICE_ID (str): Voice ID for 60db.ai TTS. LOCAL_MODEL_PATH (str): Path to the local model. """ # Model selection TRANSCRIPTION_MODEL = 'deepgram' # possible values: openai, groq, deepgram, fastwhisperapi RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama - TTS_MODEL = 'openai' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper + TTS_MODEL = 'openai' # possible values: openai, deepgram, elevenlabs, sixtydb, melotts, cartesia, piper # Piper Server configuration PIPER_SERVER_URL = os.getenv("PIPER_SERVER_URL") @@ -42,6 +44,8 @@ class Config: GROQ_API_KEY = os.getenv("GROQ_API_KEY") DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY") ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY") + SIXTYDB_API_KEY = os.getenv("SIXTYDB_API_KEY") + SIXTYDB_VOICE_ID = os.getenv("SIXTYDB_VOICE_ID", "fbb75ed2-975a-40c7-9e06-38e30524a9a1") LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH") CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY") @@ -64,7 +68,7 @@ def validate_config(): Config._validate_model('RESPONSE_MODEL', [ 'openai', 'groq', 'ollama', 'local']) Config._validate_model('TTS_MODEL', [ - 'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper']) + 'openai', 'deepgram', 'elevenlabs', 'sixtydb', 'melotts', 
'cartesia', 'local', 'piper']) Config._validate_api_key('TRANSCRIPTION_MODEL', 'openai', 'OPENAI_API_KEY') Config._validate_api_key('TRANSCRIPTION_MODEL', 'groq', 'GROQ_API_KEY') @@ -76,6 +80,7 @@ def validate_config(): Config._validate_api_key('TTS_MODEL', 'openai', 'OPENAI_API_KEY') Config._validate_api_key('TTS_MODEL', 'deepgram', 'DEEPGRAM_API_KEY') Config._validate_api_key('TTS_MODEL', 'elevenlabs', 'ELEVENLABS_API_KEY') + Config._validate_api_key('TTS_MODEL', 'sixtydb', 'SIXTYDB_API_KEY') Config._validate_api_key('TTS_MODEL', 'cartesia', 'CARTESIA_API_KEY') @staticmethod diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py index be3ee96..9a21876 100644 --- a/voice_assistant/text_to_speech.py +++ b/voice_assistant/text_to_speech.py @@ -1,6 +1,7 @@ # voice_assistant/text_to_speech.py import logging import json +import base64 import pyaudio import elevenlabs import soundfile as sf @@ -58,6 +59,34 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca model="eleven_turbo_v2" ) elevenlabs.save(audio, output_file_path) + + elif model == 'sixtydb': + response = requests.post( + "https://api.60db.ai/tts-synthesize", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + }, + json={ + "text": text, + "voice_id": Config.SIXTYDB_VOICE_ID, + "enhance": True, + "speed": 1, + "stability": 50, + "similarity": 75, + "output_format": "mp3" + } + ) + response.raise_for_status() + data = response.json() + + if not data.get("success"): + raise ValueError(f"60db TTS failed: {data.get('message', 'Unknown error')}") + + audio_bytes = base64.b64decode(data["audio_base64"]) + with open(output_file_path, "wb") as f: + f.write(audio_bytes) + logging.info(f"60db TTS audio saved to {output_file_path} ({data.get('duration_seconds', '?')}s)") elif model == "cartesia": client = Cartesia(api_key=api_key)