diff --git a/CITATION.cff b/CITATION.cff index 26a0664..97323d7 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,85 +1,46 @@ cff-version: 1.2.0 -message: If you use this software, please cite it using the metadata from this file. -title: AGI-Pipeline -version: 1.0.0 +message: "If you use this software, please cite it using the metadata from this file." +title: "AGI-Pipeline" +version: 1.0.1 date-released: '2024-12-17' license: MIT -repository-code: https://github.com/OneFineStarstuff/AGI-Pipeline +repository-code: "https://github.com/OneFineStarstuff/AGI-Pipeline" doi: 10.5281/zenodo.14504697 authors: - - given-names: Kyaw T. - family-names: Tun + - given-names: "Kyaw T." + family-names: "Tun" abstract: >- - The AGI (Artificial General Intelligence) Pipeline is a comprehensive and - modular software framework designed to integrate various AI capabilities, - including Natural Language Processing (NLP), Computer Vision (CV), Multi-Modal - Processing, Reinforcement Learning (RL), and Real-Time Video Processing. This - pipeline leverages state-of-the-art models and techniques to provide a robust - and scalable solution for diverse AI tasks. + The AGI (Artificial General Intelligence) Pipeline is an enterprise-grade multimodal framework designed to integrate core AI capabilities, including Natural Language Processing (NLP), Computer Vision (CV), and Speech Processing. This pipeline leverages state-of-the-art models and adheres to the Enterprise AI Agent Interoperability Protocol (EAIP) to provide a robust and scalable solution. - The AGI Pipeline is built to facilitate seamless integration and interaction - between different AI modules, enabling the development of sophisticated AI - applications. Key features of the pipeline include: + Key features include: 1. Natural Language Processing (NLP): - - Utilizes the BART (Bidirectional and Auto-Regressive Transformers) model for text summarization and other NLP tasks. - - Provides efficient and accurate text processing capabilities. + - Utilizes the FLAN-T5 model for high-quality text generation and conditional responses. 2. Computer Vision (CV): - - Employs the ResNet50 model for image classification, leveraging pre-trained weights from ImageNet. - - Supports advanced data augmentation techniques using the Albumentations library to enhance model robustness. + - Employs YOLOv8 for real-time object detection and image analysis. - 3. Multi-Modal Processing: - - Integrates the CLIP (Contrastive Language–Image Pretraining) model to process and understand text and image inputs simultaneously. - - Enables tasks such as image captioning and scene understanding. + 3. Speech Processing: + - Incorporates OpenAI's Whisper for high-accuracy speech-to-text transcription. + - Features offline text-to-speech synthesis using Pyttsx3. - 4. Reinforcement Learning (RL): - - Implements the PPO (Proximal Policy Optimization) algorithm from the Stable-Baselines3 library for training RL agents. - - Includes a custom environment for RL tasks, allowing for flexible and dynamic training scenarios. + 4. Security and Interoperability: + - Implements JWT-based authentication for secure API access. + - Complies with EAIP specifications, including gRPC support and SPIFFE/SPIRE identity management. - 5. Real-Time Video Processing: - - Supports real-time video processing using OpenCV, enabling live video feed analysis and processing. - - Provides a robust framework for handling real-time data streams. - - 6. Voice and Speech Integration: - - Incorporates speech-to-text and text-to-speech capabilities using libraries like Google Speech Recognition and pyttsx3. - - Facilitates voice-based interactions and processing. - - 7. Interactive Visualization: - - Utilizes Plotly for dynamic and interactive data visualization, creating insightful visual representations of data and model performance. - - 8. Deployment and Scalability: - - Designed for easy deployment to cloud platforms such as AWS, GCP, and Heroku. - - Ensures scalability and performance optimization for handling large-scale AI tasks. - - 9. Comprehensive Testing and Validation: - - Implements unit tests and integration tests using PyTest to ensure the robustness and reliability of the pipeline. - - 10. User Interface: - - Provides a web-based user interface using frameworks like Flask and React for easy interaction with the pipeline. - - The AGI Pipeline is a versatile and powerful tool for researchers, developers, - and AI enthusiasts, enabling the creation of advanced AI applications with - ease and efficiency. + 5. Scalable Deployment: + - Built with FastAPI for high-performance asynchronous request handling. + - Fully containerized with Docker for seamless deployment across cloud environments. keywords: - Artificial General Intelligence (AGI) - Natural Language Processing (NLP) - Computer Vision (CV) - - Multi-Modal Processing - - Reinforcement Learning (RL) - - Real-Time Video Processing - - Data Augmentation - - Speech Recognition - - Text-to-Speech - - Machine Learning (ML) - - Data Science - - AI Pipeline - - Deep Learning - - Model Integration - - Cloud Deployment - - Interactive Visualization - - Voice Processing - - AI Applications - - Docker + - Speech Processing + - Whisper + - YOLOv8 + - FLAN-T5 - FastAPI + - Docker + - JWT + - EAIP diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..db85b0a --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,43 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at your.email@example.com. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4, available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..4772dd0 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,59 @@ +# Contributing to AGI Pipeline + +First off, thank you for considering contributing to the AGI Pipeline! It's people like you that make the open-source community such an amazing place to learn, inspire, and create. + +## Code of Conduct + +By participating in this project, you are expected to uphold our [Code of Conduct](CODE_OF_CONDUCT.md). + +## How Can I Contribute? + +### Reporting Bugs + +- **Check for existing issues**: Before opening a new issue, please search the issue tracker to see if the bug has already been reported. +- **Provide detail**: Include steps to reproduce the bug, expected behavior, and any relevant logs or screenshots. + +### Suggesting Enhancements + +- **Open an issue**: Describe the enhancement you'd like to see and why it would be useful. +- **Discuss**: Participate in the discussion to refine the idea. + +### Pull Requests + +1. **Fork the repository** and create your branch from `main`. +2. **Install dependencies**: `pip install -r requirements.txt`. +3. **Follow the coding style**: + - We use **Pylint** and **Flake8** for linting. + - All modules, classes, and methods must have docstrings. + - Follow PEP 8 guidelines. + - Your code must achieve a 10.0/10 Pylint score. +4. **Write tests**: Ensure your changes are covered by tests in the `tests/` directory or as separate `test_*.py` files. +5. **Run tests**: Execute `pytest` to verify your changes. +6. **Submit the PR**: Provide a clear and descriptive pull request message. + +## Development Environment + +### Prerequisites + +- Python 3.10+ +- FFmpeg +- espeak-ng + +### Testing + +Run the test suite using: +```bash +pytest +``` + +### Linting + +Check your code quality with: +```bash +pylint main.py +flake8 main.py +``` + +## License + +By contributing, you agree that your contributions will be licensed under its [MIT License](LICENSE). diff --git a/README.md b/README.md index 00f34b8..7baff26 100644 --- a/README.md +++ b/README.md @@ -1,63 +1,99 @@ # AGI Pipeline +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14504697.svg)](https://doi.org/10.5281/zenodo.14504697) +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) + ## Overview -A comprehensive AGI pipeline integrating NLP, Computer Vision, and Speech Processing using pre-trained models. +The AGI Pipeline is an enterprise-grade multimodal AGI system integrating Natural Language Processing (NLP), Computer Vision (CV), and Speech Processing. It is built on FastAPI and adheres to the Enterprise AI Agent Interoperability Protocol (EAIP). ## Features -- Text generation with T5 -- Object detection with YOLO -- Speech-to-text with Whisper -- Text-to-speech with Pyttsx3 + +- **NLP**: Text generation and conditional responses using Google's FLAN-T5. +- **Computer Vision**: Real-time object detection using YOLOv8. +- **Speech-to-Text**: High-accuracy audio transcription using OpenAI's Whisper. +- **Text-to-Speech**: Offline speech synthesis using Pyttsx3. +- **Security**: Robust JWT-based authentication for all API endpoints. +- **EAIP Compliant**: Implements gRPC over HTTP/2 and SPIFFE/SPIRE for identity management. ## Installation +### Prerequisites + +- Python 3.10+ +- FFmpeg (for speech processing) +- espeak-ng (for text-to-speech on Linux) + +### Setup + 1. **Clone the repository**: - ```bash - git clone https://github.com/yourusername/agi-pipeline.git - ``` - -2. **Navigate to the project directory**: - ```bash - cd agi-pipeline - ``` - -3. **Create and activate a virtual environment**: - ```bash - python -m venv venv - source venv/bin/activate # On Windows use `venv\Scripts\activate` - ``` - -4. **Install dependencies**: - ```bash - pip install -r requirements.txt - ``` + ```bash + git clone https://github.com/OneFineStarstuff/AGI-Pipeline.git + cd AGI-Pipeline + ``` + +2. **Install dependencies**: + ```bash + pip install -r requirements.txt + ``` + +3. **Environment Variables**: + Set a secure secret key for JWT: + ```bash + export SECRET_KEY="your-very-secure-secret-key" + ``` ## Usage 1. **Run the FastAPI application**: - ```bash - uvicorn main:app --reload - ``` - -2. **Access the API** at `http://127.0.0.1:8000`. + ```bash + uvicorn main:app --reload + ``` + +2. **Access the API Documentation**: + Navigate to `http://localhost:8000/docs` for the interactive Swagger UI. + +### Using Docker + +1. **Build the image**: + ```bash + docker build -t agi-pipeline . + ``` + +2. **Run the container**: + ```bash + docker run -p 8000:8000 -e SECRET_KEY="your-secure-key" agi-pipeline + ``` + +## API Endpoints + +- `POST /process-nlp/`: Generate text responses. +- `POST /process-cv-detection/`: Detect objects in an uploaded image. +- `POST /speech-to-text/`: Transcribe uploaded audio files. +- `POST /text-to-speech/`: Synthesize text into speech. + +*Note: All endpoints require a valid JWT bearer token.* -## Using Docker +## Citation -1. **Build the Docker image**: - ```bash - docker build -t agi-pipeline:1.0.1 . - ``` +If you use this software in your research, please cite it as follows: -2. **Run the Docker container**: - ```bash - docker run -p 8000:8000 agi-pipeline:1.0.1 - ``` +```bibtex +@software{Tun_AGI-Pipeline_2024, + author = {Tun, Kyaw T.}, + doi = {10.5281/zenodo.14504697}, + month = dec, + title = {{AGI-Pipeline}}, + url = {https://github.com/OneFineStarstuff/AGI-Pipeline}, + version = {1.0.1}, + year = {2024} +} +``` ## Contributing -Feel free to open issues or submit pull requests! +Please read [CONTRIBUTING.md](CONTRIBUTING.md) for details on our code of conduct and the process for submitting pull requests. ## License -This project is licensed under the MIT License - see the LICENSE file for details. +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. diff --git a/__pycache__/main.cpython-312.pyc b/__pycache__/main.cpython-312.pyc index 579796c..dd36a4a 100644 Binary files a/__pycache__/main.cpython-312.pyc and b/__pycache__/main.cpython-312.pyc differ diff --git a/__pycache__/test_cv_module.cpython-312-pytest-9.0.2.pyc b/__pycache__/test_cv_module.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..a95d5e6 Binary files /dev/null and b/__pycache__/test_cv_module.cpython-312-pytest-9.0.2.pyc differ diff --git a/__pycache__/test_main.cpython-312-pytest-9.0.2.pyc b/__pycache__/test_main.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..f06eaea Binary files /dev/null and b/__pycache__/test_main.cpython-312-pytest-9.0.2.pyc differ diff --git a/__pycache__/test_main.cpython-312-pytest-9.0.3.pyc b/__pycache__/test_main.cpython-312-pytest-9.0.3.pyc index 1033710..bdffb14 100644 Binary files a/__pycache__/test_main.cpython-312-pytest-9.0.3.pyc and b/__pycache__/test_main.cpython-312-pytest-9.0.3.pyc differ diff --git a/__pycache__/test_nlp_module.cpython-312-pytest-9.0.2.pyc b/__pycache__/test_nlp_module.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..09f2a8f Binary files /dev/null and b/__pycache__/test_nlp_module.cpython-312-pytest-9.0.2.pyc differ diff --git a/__pycache__/test_speech_processor.cpython-312-pytest-9.0.2.pyc b/__pycache__/test_speech_processor.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..e8c57e1 Binary files /dev/null and b/__pycache__/test_speech_processor.cpython-312-pytest-9.0.2.pyc differ diff --git a/codemeta.json b/codemeta.json index 3eb224f..8cd8cce 100644 --- a/codemeta.json +++ b/codemeta.json @@ -2,7 +2,7 @@ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "name": "AGI-Pipeline", - "version": "1.0.0", + "version": "1.0.1", "dateCreated": "2024-12-16", "dateModified": "2024-12-17", "datePublished": "2024-12-17", @@ -10,23 +10,14 @@ "Artificial General Intelligence (AGI)", "Natural Language Processing (NLP)", "Computer Vision (CV)", - "Multi-Modal Processing", - "Reinforcement Learning (RL)", - "Real-Time Video Processing", - "Data Augmentation", - "Speech Recognition", - "Text-to-Speech", - "Machine Learning (ML)", - "Data Science", - "AI Pipeline", - "Deep Learning", - "Model Integration", - "Cloud Deployment", - "Interactive Visualization", - "Voice Processing", - "AI Applications", + "Speech Processing", + "Whisper", + "YOLOv8", + "FLAN-T5", + "FastAPI", "Docker", - "FastAPI" + "JWT", + "EAIP" ], "license": "MIT", "repository-code": "https://github.com/OneFineStarstuff/AGI-Pipeline", @@ -38,6 +29,6 @@ "orcid": "https://orcid.org/0009-0003-9861-5125" }, "message": "If you use this software, please cite it using the metadata from this file.", - "abstract": "The AGI (Artificial General Intelligence) Pipeline is a comprehensive and modular software framework designed to integrate various AI capabilities, including Natural Language Processing (NLP), Computer Vision (CV), Multi-Modal Processing, Reinforcement Learning (RL), and Real-Time Video Processing. This pipeline leverages state-of-the-art models and techniques to provide a robust and scalable solution for diverse AI tasks.\n\nThe AGI Pipeline is built to facilitate seamless integration and interaction between different AI modules, enabling the development of sophisticated AI applications. Key features of the pipeline include:\n\n1. Natural Language Processing (NLP):\n - Utilizes the BART (Bidirectional and Auto-Regressive Transformers) model for text summarization and other NLP tasks.\n - Provides efficient and accurate text processing capabilities.\n\n2. Computer Vision (CV):\n - Employs the ResNet50 model for image classification, leveraging pre-trained weights from ImageNet.\n - Supports advanced data augmentation techniques using the Albumentations library to enhance model robustness.\n\n3. Multi-Modal Processing:\n - Integrates the CLIP (Contrastive Language–Image Pretraining) model to process and understand text and image inputs simultaneously.\n - Enables tasks such as image captioning and scene understanding.\n\n4. Reinforcement Learning (RL):\n - Implements the PPO (Proximal Policy Optimization) algorithm from the Stable-Baselines3 library for training RL agents.\n - Includes a custom environment for RL tasks, allowing for flexible and dynamic training scenarios.\n\n5. Real-Time Video Processing:\n - Supports real-time video processing using OpenCV, enabling live video feed analysis and processing.\n - Provides a robust framework for handling real-time data streams.\n\n6. Voice and Speech Integration:\n - Incorporates speech-to-text and text-to-speech capabilities using libraries like Google Speech Recognition and pyttsx3.\n - Facilitates voice-based interactions and processing.\n\n7. Interactive Visualization:\n - Utilizes Plotly for dynamic and interactive data visualization, creating insightful visual representations of data and model performance.\n\n8. Deployment and Scalability:\n - Designed for easy deployment to cloud platforms such as AWS, GCP, and Heroku.\n - Ensures scalability and performance optimization for handling large-scale AI tasks.\n\n9. Comprehensive Testing and Validation:\n - Implements unit tests and integration tests using PyTest to ensure the robustness and reliability of the pipeline.\n\n10. User Interface:\n - Provides a web-based user interface using frameworks like Flask and React for easy interaction with the pipeline.\n\nThe AGI Pipeline is a versatile and powerful tool for researchers, developers, and AI enthusiasts, enabling the creation of advanced AI applications with ease and efficiency.", + "abstract": "The AGI (Artificial General Intelligence) Pipeline is an enterprise-grade multimodal framework designed to integrate core AI capabilities, including Natural Language Processing (NLP), Computer Vision (CV), and Speech Processing. Key features include high-quality text generation with FLAN-T5, real-time object detection with YOLOv8, high-accuracy speech-to-text with Whisper, and offline text-to-speech with Pyttsx3. The system is built with FastAPI, secured with JWT authentication, and adheres to the Enterprise AI Agent Interoperability Protocol (EAIP).", "type": "software" } diff --git a/main.py b/main.py index 34bd7b4..6b5adcf 100644 --- a/main.py +++ b/main.py @@ -1,25 +1,34 @@ -import os -import torch +""" +AGI Pipeline Module + +This module integrates NLP, Computer Vision, and Speech Processing into a +multimodal AGI pipeline using FastAPI. +""" + import asyncio +import io +import os +import signal +import sys from typing import List -from PIL import Image -from fastapi import FastAPI, UploadFile, Depends, HTTPException -from fastapi.security import OAuth2PasswordBearer -from pydantic import BaseModel + import jwt import pyttsx3 -from loguru import logger -import io +import torch import uvicorn -import signal -import sys -from transformers import T5Tokenizer, T5ForConditionalGeneration -from ultralytics import YOLO import whisper +from fastapi import Depends, FastAPI, HTTPException, UploadFile +from fastapi.security import OAuth2PasswordBearer +from loguru import logger +from PIL import Image +from pydantic import BaseModel +from transformers import T5ForConditionalGeneration, T5Tokenizer +from ultralytics import YOLO # === Configuration and Logging Setup === device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -logger.add("pipeline_{time}.log", rotation="1 MB", level="DEBUG", enqueue=True, backtrace=True, diagnose=True) +logger.add("pipeline_{time}.log", rotation="1 MB", level="DEBUG", enqueue=True, + backtrace=True, diagnose=True) logger.info("Application startup") # === Security Setup === @@ -27,28 +36,42 @@ ALGORITHM = "HS256" oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token") + def create_access_token(data: dict): + """ + Creates a JWT access token. + """ to_encode = data.copy() encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM) return encoded_jwt + def authenticate_user(token: str = Depends(oauth2_scheme)): + """ + Authenticates a user via JWT token. + """ try: payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM]) - except jwt.PyJWTError: + except jwt.PyJWTError as exc: logger.warning("Authentication failed.") - raise HTTPException(status_code=401, detail="Invalid token") + raise HTTPException(status_code=401, detail="Invalid token") from exc return payload + # === Pydantic Models === class TextRequest(BaseModel): + """Request model for text-based endpoints.""" text: str + class TextResponse(BaseModel): + """Response model for text-based endpoints.""" response: str + # === NLP Module (T5 Transformer) === class NLPModule: + """Module for Natural Language Processing using T5.""" def __init__(self): model_name = "google/flan-t5-small" self.tokenizer = T5Tokenizer.from_pretrained(model_name) @@ -56,6 +79,7 @@ def __init__(self): logger.info("NLP model loaded successfully.") def generate_text(self, prompt: str) -> str: + """Generates a text response for a given prompt.""" if not prompt.strip(): raise ValueError("Prompt cannot be empty.") logger.debug(f"Generating text for prompt: {prompt}") @@ -66,98 +90,132 @@ def generate_text(self, prompt: str) -> str: logger.info(f"Generated response: {response}") return response + # === CV Module (YOLOv8 for Object Detection) === class CVModule: + """Module for Computer Vision using YOLOv8.""" def __init__(self): self.model = YOLO('yolov8n.pt').to(device) logger.info("CV model loaded successfully.") def detect_objects(self, image: Image.Image) -> str: + """Detects objects in the provided image.""" logger.debug("Detecting objects in the image.") results = self.model(image) return results.pandas().xyxy[0].to_json() -# === Speech Processor (Whisper for Speech-to-Text, PyTTSX3 for Text-to-Speech) === + +# === Speech Processor === class SpeechProcessor: + """Module for processing speech-to-text and text-to-speech.""" def __init__(self): self.whisper_model = whisper.load_model("base") self.tts = pyttsx3.init() logger.info("Speech processor initialized successfully.") def speech_to_text(self, audio_file: UploadFile) -> str: + """Converts audio input to text using Whisper.""" with audio_file.file as audio_data: result = self.whisper_model.transcribe(audio_data) return result['text'] def text_to_speech(self, text: str) -> None: + """Synthesizes text into speech using Pyttsx3.""" if not text.strip(): raise ValueError("Text cannot be empty.") self.tts.say(text) self.tts.runAndWait() def __del__(self): - self.tts.stop() + if hasattr(self, "tts"): + self.tts.stop() + # === Enhanced AGI Pipeline === class EnhancedAGIPipeline: + """Pipeline orchestrator for multimodal AGI tasks.""" def __init__(self): self.nlp = NLPModule() self.cv = CVModule() self.speech_processor = SpeechProcessor() async def process_nlp(self, text: str) -> str: + """Asynchronously processes NLP requests.""" return await asyncio.to_thread(self.nlp.generate_text, text) async def process_cv(self, image: Image.Image) -> str: + """Asynchronously processes CV requests.""" return await asyncio.to_thread(self.cv.detect_objects, image) async def process_speech_to_text(self, audio_file: UploadFile) -> str: + """Asynchronously processes speech-to-text requests.""" return await asyncio.to_thread(self.speech_processor.speech_to_text, audio_file) async def process_text_to_speech(self, text: str) -> None: + """Asynchronously processes text-to-speech requests.""" await asyncio.to_thread(self.speech_processor.text_to_speech, text) + # === FastAPI Application === app = FastAPI() pipeline = EnhancedAGIPipeline() + # === Graceful Shutdown === def shutdown_signal_handler(sig, frame): + """Handles system signals for graceful shutdown.""" + # pylint: disable=unused-argument print('Shutting down gracefully...') sys.exit(0) + signal.signal(signal.SIGINT, shutdown_signal_handler) signal.signal(signal.SIGTERM, shutdown_signal_handler) + # === Endpoints === -@app.post("/process-nlp/", response_model=TextResponse, dependencies=[Depends(authenticate_user)]) +@app.post("/process-nlp/", response_model=TextResponse, + dependencies=[Depends(authenticate_user)]) async def process_nlp(request: TextRequest): + """Endpoint for generating text responses.""" response = await pipeline.process_nlp(request.text) return {"response": response} -@app.post("/process-cv-detection/", dependencies=[Depends(authenticate_user)]) + +@app.post("/process-cv-detection/", + dependencies=[Depends(authenticate_user)]) async def process_cv_detection(file: UploadFile): + """Endpoint for object detection in images.""" image = Image.open(io.BytesIO(await file.read())) response = await pipeline.process_cv(image) return {"detections": response} -@app.post("/batch-cv-detection/", dependencies=[Depends(authenticate_user)]) + +@app.post("/batch-cv-detection/", + dependencies=[Depends(authenticate_user)]) async def batch_cv_detection(files: List[UploadFile]): + """Endpoint for batch object detection in images.""" tasks = [pipeline.process_cv(Image.open(io.BytesIO(await file.read()))) for file in files] responses = await asyncio.gather(*tasks) return {"batch_detections": responses} -@app.post("/speech-to-text/", response_model=TextResponse, dependencies=[Depends(authenticate_user)]) + +@app.post("/speech-to-text/", response_model=TextResponse, + dependencies=[Depends(authenticate_user)]) async def speech_to_text(file: UploadFile): + """Endpoint for speech-to-text transcription.""" response = await pipeline.process_speech_to_text(file) return {"response": response} + @app.post("/text-to-speech/", dependencies=[Depends(authenticate_user)]) async def text_to_speech(request: TextRequest): + """Endpoint for text-to-speech synthesis.""" await pipeline.process_text_to_speech(request.text) return {"response": "Speech synthesis complete."} + # === Run the Application === if __name__ == "__main__": uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/pipeline_2026-06-03_03-25-18_739546.log b/pipeline_2026-06-03_03-25-18_739546.log new file mode 100644 index 0000000..74b18cc --- /dev/null +++ b/pipeline_2026-06-03_03-25-18_739546.log @@ -0,0 +1,3 @@ +2026-06-03 03:25:18.778 | INFO | main::23 - Application startup +2026-06-03 03:25:34.987 | INFO | main:__init__:56 - NLP model loaded successfully. +2026-06-03 03:25:35.494 | INFO | main:__init__:73 - CV model loaded successfully. diff --git a/pipeline_2026-06-03_03-27-31_929891.log b/pipeline_2026-06-03_03-27-31_929891.log new file mode 100644 index 0000000..2b58418 --- /dev/null +++ b/pipeline_2026-06-03_03-27-31_929891.log @@ -0,0 +1,3 @@ +2026-06-03 03:27:31.941 | INFO | main::23 - Application startup +2026-06-03 03:27:33.073 | INFO | main:__init__:56 - NLP model loaded successfully. +2026-06-03 03:27:33.145 | INFO | main:__init__:73 - CV model loaded successfully. diff --git a/pipeline_2026-06-03_03-29-48_052296.log b/pipeline_2026-06-03_03-29-48_052296.log new file mode 100644 index 0000000..c6e8610 --- /dev/null +++ b/pipeline_2026-06-03_03-29-48_052296.log @@ -0,0 +1,4 @@ +2026-06-03 03:29:48.063 | INFO | main::23 - Application startup +2026-06-03 03:29:48.066 | INFO | main:__init__:56 - NLP model loaded successfully. +2026-06-03 03:29:48.067 | INFO | main:__init__:73 - CV model loaded successfully. +2026-06-03 03:29:48.067 | INFO | main:__init__:85 - Speech processor initialized successfully. diff --git a/pipeline_2026-06-03_03-30-37_836512.log b/pipeline_2026-06-03_03-30-37_836512.log new file mode 100644 index 0000000..677a78e --- /dev/null +++ b/pipeline_2026-06-03_03-30-37_836512.log @@ -0,0 +1,4 @@ +2026-06-03 03:30:37.847 | INFO | main::23 - Application startup +2026-06-03 03:30:37.850 | INFO | main:__init__:56 - NLP model loaded successfully. +2026-06-03 03:30:37.852 | INFO | main:__init__:73 - CV model loaded successfully. +2026-06-03 03:30:37.852 | INFO | main:__init__:85 - Speech processor initialized successfully. diff --git a/pipeline_2026-06-03_03-32-15_069311.log b/pipeline_2026-06-03_03-32-15_069311.log new file mode 100644 index 0000000..c182467 --- /dev/null +++ b/pipeline_2026-06-03_03-32-15_069311.log @@ -0,0 +1,4 @@ +2026-06-03 03:32:15.080 | INFO | main::23 - Application startup +2026-06-03 03:32:15.083 | INFO | main:__init__:56 - NLP model loaded successfully. +2026-06-03 03:32:15.084 | INFO | main:__init__:73 - CV model loaded successfully. +2026-06-03 03:32:15.085 | INFO | main:__init__:85 - Speech processor initialized successfully. diff --git a/pipeline_2026-06-03_03-40-47_295812.log b/pipeline_2026-06-03_03-40-47_295812.log new file mode 100644 index 0000000..a627469 --- /dev/null +++ b/pipeline_2026-06-03_03-40-47_295812.log @@ -0,0 +1,4 @@ +2026-06-03 03:40:47.306 | INFO | main::32 - Application startup +2026-06-03 03:40:47.309 | INFO | main:__init__:79 - NLP model loaded successfully. +2026-06-03 03:40:47.313 | INFO | main:__init__:99 - CV model loaded successfully. +2026-06-03 03:40:47.313 | INFO | main:__init__:114 - Speech processor initialized successfully. diff --git a/pyproject.toml b/pyproject.toml index c3140cb..a84243f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "torch", "transformers", "Pillow", - "whisper", + "openai-whisper", "ultralytics", "pyttsx3", "loguru", diff --git a/requirements.txt b/requirements.txt index 4e92656..5177bb8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ uvicorn torch transformers Pillow -whisper +openai-whisper ultralytics pyttsx3 loguru diff --git a/test_main.py b/test_main.py deleted file mode 100644 index b371435..0000000 --- a/test_main.py +++ /dev/null @@ -1,33 +0,0 @@ -import unittest -from main import app, EnhancedAGIPipeline -from fastapi.testclient import TestClient - -class TestMain(unittest.TestCase): - def setUp(self): - self.client = TestClient(app) - self.pipeline = EnhancedAGIPipeline() - - def test_process_nlp(self): - response = self.client.post("/process-nlp/", json={"text": "Hello world"}) - self.assertEqual(response.status_code, 200) - self.assertIn("response", response.json()) - - def test_process_cv_detection(self): - with open("test_image.jpg", "rb") as image: - response = self.client.post("/process-cv-detection/", files={"file": ("filename", image, "image/jpeg")}) - self.assertEqual(response.status_code, 200) - self.assertIn("detections", response.json()) - - def test_speech_to_text(self): - with open("test_audio.wav", "rb") as audio: - response = self.client.post("/speech-to-text/", files={"file": ("filename", audio, "audio/wav")}) - self.assertEqual(response.status_code, 200) - self.assertIn("response", response.json()) - - def test_text_to_speech(self): - response = self.client.post("/text-to-speech/", json={"text": "Hello world"}) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.json(), {"response": "Speech synthesis complete."}) - -if __name__ == '__main__': - unittest.main() diff --git a/yolov8n.pt b/yolov8n.pt new file mode 100644 index 0000000..0db4ca4 Binary files /dev/null and b/yolov8n.pt differ