From 2b08bc3e3b7c3843eb9fe72cc5e8dd1640cb47b4 Mon Sep 17 00:00:00 2001 From: mbv06 Date: Thu, 11 Jun 2026 16:19:35 +0300 Subject: [PATCH 1/7] Support JPEG documents as image inputs --- app/main_bot.py | 138 +++++++----------------------------------------- 1 file changed, 20 insertions(+), 118 deletions(-) diff --git a/app/main_bot.py b/app/main_bot.py index 23b4baa..f6ca529 100644 --- a/app/main_bot.py +++ b/app/main_bot.py @@ -1479,116 +1479,9 @@ async def process_media_group(update: Update, context: ContextTypes.DEFAULT_TYPE print(f"User {user_id} has {len(photos)} photos in media group") - if media_group_captions[user_id] == None: - caption = "Describe what is in this image in user language." - describe_question = caption - else: - caption = media_group_captions[user_id] - describe_question = f"Describe what is in this image and answer to this question: {caption}" - - - # Send initial status message - status_message = await update.message.reply_text("🖼️ *Analyzing images...*", parse_mode="markdown") - - temp_photos = [] # Keep track of temporary files for cleanup - all_descriptions = [] # Store descriptions for all photos - image_paths = [] # Store paths to downloaded images - - try: - # Process each photo - for i, photo_group in enumerate(photos, 1): - try: - photo = photo_group[0] # Get the photo from the group - # Download the photo - photo_file = await context.bot.get_file(photo.file_id) - temp_dir = f"./data/{user_id}/temp_photos" - os.makedirs(temp_dir, exist_ok=True) - temp_photo = os.path.join(temp_dir, f"photo_{uuid.uuid4()}.jpg") - temp_photos.append(temp_photo) - await photo_file.download_to_drive(temp_photo) - - # Save the permanent copy to user's directory - user_images_dir = os.path.join("data", user_id, "images") - os.makedirs(user_images_dir, exist_ok=True) - permanent_image_path = os.path.join(user_images_dir, f"image_{uuid.uuid4()}.jpg") - # Copy the image to the permanent location - shutil.copy(temp_photo, permanent_image_path) - image_paths.append(permanent_image_path) - - # Get descriptions from both services - await status_message.edit_text(f"🤖 *Getting Anthropic description for image {i}...*", parse_mode="markdown") - anthropic_description = await describe_image_anthropic(question=describe_question, image_path=temp_photo) - stats_tracker.track_describe_used(user_id, "image_anthropic") - - await status_message.edit_text(f"🤖 *Getting OpenAI description for image {i}...*", parse_mode="markdown") - openai_description = await describe_image_openai(question=describe_question, image_path=temp_photo) - stats_tracker.track_describe_used(user_id, "image_openai") - - all_descriptions.append({ - 'anthropic': anthropic_description, - 'openai': openai_description, - 'path': permanent_image_path - }) - - except Exception as e: - logging.error(f"Error processing photo {i}: {str(e)}") - all_descriptions.append({ - 'anthropic': f"Error processing image {i}", - 'openai': f"Error processing image {i}", - 'path': "error_path" - }) - - # Craft the user question combining caption and all descriptions - descriptions_text = "\n\n".join([ - f"Image {i+1} (path: {desc['path']}):\n" - f"Anthropic description: {desc['anthropic']}\n" - f"OpenAI description: {desc['openai']}" - for i, desc in enumerate(all_descriptions) - ]) - - user_question = f"{caption}\n\nUser attached {len(all_descriptions)} image(s) to this message. Here are the details about each image from Anthropic and OpenAI:\n\n{descriptions_text}" - - await status_message.edit_text("🤖 *Processing...*", parse_mode="markdown") - # Process like a regular message - await context.bot.send_chat_action(chat_id=update.message.chat_id, action='typing', message_thread_id=get_thread_id(update)) - thinking_message = await update.message.reply_text("💭 *Thinking...*", parse_mode="markdown") - _, _, mg_limit = check_user_limits(user_id) - - async def update_thinking_message(step: str, details: str, iteration: int, critique: int): - if step == "saving": - iteration = "final" - critique = "end" - live_limit_info = "" - if mg_limit: - live_used = stats_tracker.get_user_action_count(user_id, days=30) - live_limit_info = f"📊 *Usage:* _{live_used}/{mg_limit} actions (30d)_\n" - await thinking_message.edit_text( - f"💭 *Thinking...*\n" - f"- - - - \n" - f"{live_limit_info}" - f"📝 *Step:* _{step.replace('_', '-')}_\n" - f"📋 *Details:* _{details.replace('_', '-')}_\n" - f"🔄 *Iterations:* _{iteration}_\n" - f"🎯 *Critiques:* _{critique}_", - parse_mode="markdown" - ) - - # Get response using the same logic as handle_message - _thread_id = get_thread_id(update) - on_text_chunk = create_streaming_callback(context.bot, user_id, _thread_id) - response, messages = await get_answer(user_question, user_id, update_thinking_message, update, context, on_text_chunk=on_text_chunk, message_thread_id=_thread_id) - await send_response_to_user(update, thinking_message, response, user_id) - await send_reasoning_file(update, messages, user_id) - await status_message.edit_text("🤖 *Done!*", parse_mode="markdown") - - finally: - # Clean up all temporary files - for temp_photo in temp_photos: - try: - if os.path.exists(temp_photo): - os.remove(temp_photo) - except Exception as e: - print(f"Error cleaning up temporary photo file {temp_photo}: {str(e)}") + image_refs = [photo_group[0] for photo_group in photos] + _, _, mg_limit = check_user_limits(user_id) + await process_image_message(update, context, user_id, image_refs, media_group_captions[user_id], mg_limit) finally: # Clean up media group data media_group_id[user_id] = None @@ -1646,8 +1539,7 @@ async def handle_photo_message(update: Update, context: ContextTypes.DEFAULT_TYP async def process_media_group_with_timeout(): try: await asyncio.sleep(MEDIA_GROUP_TIMEOUT) - async with get_user_lock(user_id): - await process_media_group(update, context, user_id) + await process_media_group(update, context, user_id) except asyncio.CancelledError: pass except Exception as e: @@ -1658,13 +1550,16 @@ async def process_media_group_with_timeout(): return # Handle single photo message + await process_image_message(update, context, user_id, [update.message.photo[-1]], update.message.caption, limit) + +async def process_image_message(update: Update, context: ContextTypes.DEFAULT_TYPE, user_id: str, image_refs: list, caption: str, limit: int | None = None): + """Process one or more Telegram image file refs through the photo analysis pipeline.""" async with get_user_lock(user_id): - photos = [update.message.photo[-1]] - if update.message.caption == None: + photos = image_refs + if caption is None: caption = "Describe what is in this image in user language." describe_question = caption else: - caption = update.message.caption describe_question = f"Describe what is in this image and answer to this question: {caption}" status_message = await update.message.reply_text("🖼️ *Analyzing images...*", parse_mode="markdown") @@ -1689,11 +1584,12 @@ async def process_media_group_with_timeout(): shutil.copy(temp_photo, permanent_image_path) image_paths.append(permanent_image_path) - await status_message.edit_text(f"🤖 *Getting Anthropic description...*", parse_mode="markdown") + image_suffix = f" for image {i}" if len(photos) > 1 else "" + await status_message.edit_text(f"🤖 *Getting Anthropic description{image_suffix}...*", parse_mode="markdown") anthropic_description = await describe_image_anthropic(question=describe_question, image_path=temp_photo) stats_tracker.track_describe_used(user_id, "image_anthropic") - await status_message.edit_text(f"🤖 *Getting OpenAI description...*", parse_mode="markdown") + await status_message.edit_text(f"🤖 *Getting OpenAI description{image_suffix}...*", parse_mode="markdown") openai_description = await describe_image_openai(question=describe_question, image_path=temp_photo) stats_tracker.track_describe_used(user_id, "image_openai") @@ -1787,6 +1683,12 @@ async def handle_document_message(update: Update, context: ContextTypes.DEFAULT_ await handle_video_message(update, context) return + image_extensions = ['.jpg', '.jpeg'] + if file_extension in image_extensions: + stats_tracker.track_message_received(user_id, "photo") + await process_image_message(update, context, user_id, [update.message.document], update.message.caption, limit) + return + # Track message received stats_tracker.track_message_received(user_id, "document") @@ -1794,7 +1696,7 @@ async def handle_document_message(update: Update, context: ContextTypes.DEFAULT_ if file_extension not in supported_extensions: supported_formats = ", ".join([ext.replace(".", "").upper() for ext in supported_extensions]) - await update.message.reply_text(f"❌ Only {supported_formats} documents are supported.") + await update.message.reply_text(f"❌ Only {supported_formats} documents and JPG/JPEG images are supported.") return async with get_user_lock(user_id): From 20fd4c4391857d022f5eebaf29c8fb7ca42b73aa Mon Sep 17 00:00:00 2001 From: mbv06 Date: Thu, 11 Jun 2026 19:17:34 +0300 Subject: [PATCH 2/7] Handle grouped JPEG image documents --- .env.example | 3 ++ app/main_bot.py | 115 ++++++++++++++++++++++++++++++------------------ 2 files changed, 75 insertions(+), 43 deletions(-) diff --git a/.env.example b/.env.example index c4e2000..45f2758 100644 --- a/.env.example +++ b/.env.example @@ -42,3 +42,6 @@ TELEGRAM_API_ID=your_telegram_api_id_here TELEGRAM_API_HASH=your_telegram_api_hash_here TELEGRAM_USE_LOCAL_API=true TELEGRAM_LOCAL_API_URL=http://localhost:8081 + +# Image Processing Configuration +MAX_IMAGE_SIZE=1024 diff --git a/app/main_bot.py b/app/main_bot.py index f6ca529..67ed7ec 100644 --- a/app/main_bot.py +++ b/app/main_bot.py @@ -22,6 +22,7 @@ from secure_container.main import initialize_secure_containers, cleanup_containers from stats import stats_tracker, DEFAULT_ACTION_LIMIT import time +from PIL import Image # Streaming config - read directly from env, not imported streaming_enabled = os.getenv("STREAMING_ENABLED", "false").lower() == "true" @@ -85,6 +86,7 @@ def get_prober_name(): anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") anthropic_client = anthropic.AsyncAnthropic(api_key=anthropic_api_key) send_reasoning = True +MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1024") or "1024") user_invites = {} authorized_users = set(telegram_chat_id) @@ -249,8 +251,31 @@ def validate_semantic_max_results(self, max_results: int) -> int: return max(1, min(20, max_results)) def encode_image(image_path): - with open(image_path, "rb") as image_file: - return base64.b64encode(image_file.read()).decode('utf-8') + image_bytes = b"" + try: + with Image.open(image_path) as img: + needs_resizing = max(img.size) > MAX_IMAGE_SIZE + is_jpeg = (img.format or "").upper() in ("JPEG", "JPG") + + if needs_resizing or not is_jpeg: + if needs_resizing: + resample_filter = getattr(Image, 'Resampling', Image).LANCZOS + img.thumbnail((MAX_IMAGE_SIZE, MAX_IMAGE_SIZE), resample_filter) + + buffer = io.BytesIO() + if img.mode != "RGB": + img = img.convert("RGB") + img.save(buffer, format="JPEG", quality=85) + image_bytes = buffer.getvalue() + except Exception as e: + logger.warning(f"Error compressing image: {e}. Falling back to original size.") + + if not image_bytes: + with open(image_path, "rb") as image_file: + image_bytes = image_file.read() + + + return base64.b64encode(image_bytes).decode('utf-8') def split_text_intelligently(text: str, max_length: int = 4000) -> list[str]: @@ -1462,24 +1487,59 @@ async def voice_button(update: Update, context: ContextTypes.DEFAULT_TYPE): parse_mode="markdown" ) +async def queue_image_media_group(update: Update, context: ContextTypes.DEFAULT_TYPE, user_id: str, image_ref, caption: str | None): + """Collect Telegram album image refs and process them after updates stop arriving.""" + if user_id in media_group_tasks and media_group_tasks[user_id]: + media_group_tasks[user_id].cancel() + + if user_id not in media_group_id or media_group_id[user_id] != update.message.media_group_id: + media_group_id[user_id] = update.message.media_group_id + media_group_photos[user_id] = [] + media_group_captions[user_id] = None + media_group_waiting_message[user_id] = await update.message.reply_text( + "🖼️ *Image media group received... Waiting for images...*", + parse_mode="markdown" + ) + + if caption and media_group_captions[user_id] is None: + media_group_captions[user_id] = caption + + if image_ref not in media_group_photos[user_id]: + media_group_photos[user_id].append(image_ref) + await media_group_waiting_message[user_id].edit_text( + f"🖼️ *Image {len(media_group_photos[user_id])} received... Waiting for other images...*", + parse_mode="markdown" + ) + + async def process_media_group_with_timeout(): + try: + await asyncio.sleep(MEDIA_GROUP_TIMEOUT) + await process_media_group(update, context, user_id) + except asyncio.CancelledError: + pass + except Exception as e: + logging.error(f"Error in process_media_group_with_timeout: {str(e)}") + + task = asyncio.create_task(process_media_group_with_timeout()) + media_group_tasks[user_id] = task + async def process_media_group(update: Update, context: ContextTypes.DEFAULT_TYPE, user_id: str): - """Process all photos in a media group""" + """Process all images in a media group""" try: - if not user_id in media_group_photos: + if user_id not in media_group_photos or not media_group_photos[user_id]: print(f"User {user_id} is waiting for media group, but it is not in media_group_photos") return if user_id in media_group_waiting_message and media_group_waiting_message[user_id]: await media_group_waiting_message[user_id].edit_text("🖼️ *Processing media group...*", parse_mode="markdown") - photos = media_group_photos[user_id] + image_refs = media_group_photos[user_id] # Track media group processed - stats_tracker.track_media_group_processed(user_id, len(photos)) + stats_tracker.track_media_group_processed(user_id, len(image_refs)) - print(f"User {user_id} has {len(photos)} photos in media group") + print(f"User {user_id} has {len(image_refs)} images in media group") - image_refs = [photo_group[0] for photo_group in photos] _, _, mg_limit = check_user_limits(user_id) await process_image_message(update, context, user_id, image_refs, media_group_captions[user_id], mg_limit) finally: @@ -1512,41 +1572,7 @@ async def handle_photo_message(update: Update, context: ContextTypes.DEFAULT_TYP stats_tracker.track_message_received(user_id, "photo") if update.message.media_group_id: - # Media group collection stays outside lock - multiple photos arrive rapidly - if user_id in media_group_tasks and media_group_tasks[user_id]: - media_group_tasks[user_id].cancel() - - if user_id not in media_group_id or media_group_id[user_id] != update.message.media_group_id: - media_group_id[user_id] = update.message.media_group_id - media_group_photos[user_id] = [] - media_group_captions[user_id] = None - media_group_waiting_message[user_id] = await update.message.reply_text( - "🖼️ *Image media group received... Waiting for images...*", - parse_mode="markdown" - ) - - if update.message.caption and media_group_captions[user_id] == None: - media_group_captions[user_id] = update.message.caption - - photos = [update.message.photo[-1]] - if photos not in media_group_photos[user_id]: - media_group_photos[user_id].append(photos) - await media_group_waiting_message[user_id].edit_text( - f"🖼️ *Image {len(media_group_photos[user_id])} received... Waiting for other images...*", - parse_mode="markdown" - ) - - async def process_media_group_with_timeout(): - try: - await asyncio.sleep(MEDIA_GROUP_TIMEOUT) - await process_media_group(update, context, user_id) - except asyncio.CancelledError: - pass - except Exception as e: - logging.error(f"Error in process_media_group_with_timeout: {str(e)}") - - task = asyncio.create_task(process_media_group_with_timeout()) - media_group_tasks[user_id] = task + await queue_image_media_group(update, context, user_id, update.message.photo[-1], update.message.caption) return # Handle single photo message @@ -1686,6 +1712,9 @@ async def handle_document_message(update: Update, context: ContextTypes.DEFAULT_ image_extensions = ['.jpg', '.jpeg'] if file_extension in image_extensions: stats_tracker.track_message_received(user_id, "photo") + if update.message.media_group_id: + await queue_image_media_group(update, context, user_id, update.message.document, update.message.caption) + return await process_image_message(update, context, user_id, [update.message.document], update.message.caption, limit) return From f7459583eaa8cfd17a170f8fe8014be97ee5ac41 Mon Sep 17 00:00:00 2001 From: mbv06 Date: Thu, 11 Jun 2026 19:18:43 +0300 Subject: [PATCH 3/7] Add audioop-lts dependency for Python 3.13 compatibility --- app/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/app/requirements.txt b/app/requirements.txt index 7a8636d..8db5241 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -8,6 +8,7 @@ boto3>=1.36.0 loguru>=0.7.0 elevenlabs>=0.3.0 pydub>=0.25.1 # winget install ffmpeg +audioop-lts>=0.2.2; python_version >= "3.13" ffmpeg-downloader # ffdl install --add-path requests>=2.31.0 beautifulsoup4>=4.12.0 From b92b21ad7c186bb071732b6e887bdeef73014b01 Mon Sep 17 00:00:00 2001 From: mbv06 Date: Fri, 12 Jun 2026 00:34:23 +0300 Subject: [PATCH 4/7] Limit image edit input resolution --- .env.example | 3 +- app/agents/image_tools.py | 76 +++++++++++++++++++++++++++++++++------ app/main_bot.py | 23 ++++++------ 3 files changed, 80 insertions(+), 22 deletions(-) diff --git a/.env.example b/.env.example index 45f2758..05f1028 100644 --- a/.env.example +++ b/.env.example @@ -44,4 +44,5 @@ TELEGRAM_USE_LOCAL_API=true TELEGRAM_LOCAL_API_URL=http://localhost:8081 # Image Processing Configuration -MAX_IMAGE_SIZE=1024 +MAX_IMAGE_RESOLUTION_VISION=1024 +MAX_IMAGE_RESOLUTION_EDIT=4096 diff --git a/app/agents/image_tools.py b/app/agents/image_tools.py index e1b9eca..a5fd804 100644 --- a/app/agents/image_tools.py +++ b/app/agents/image_tools.py @@ -10,12 +10,14 @@ from google import genai from google.genai import types import PIL.Image +import PIL.ImageOps load_dotenv() openai_api_key = os.getenv("OPENAI_API_KEY") openai_client = OpenAI(api_key=openai_api_key) google_api_key = os.getenv("GOOGLE_API_KEY") genai_client = genai.Client(api_key=google_api_key) +max_image_resolution_edit = int(os.getenv("MAX_IMAGE_RESOLUTION_EDIT", "4096") or "4096") class TextPart: def __init__(self, text): @@ -273,6 +275,37 @@ def _resolve_image_path(self, image_path: str) -> Path: return relative_to_images return p # Return original so the caller can report the correct path + def _prepare_image_for_edit(self, image_path: Path, temp_paths: List[Path]) -> Path: + """Create a temporary downsized JPEG for oversized edit inputs.""" + if max_image_resolution_edit > 0: + try: + with PIL.Image.open(image_path) as img: + img = PIL.ImageOps.exif_transpose(img) + if max(img.size) > max_image_resolution_edit: + img.thumbnail( + (max_image_resolution_edit, max_image_resolution_edit), + getattr(PIL.Image, "Resampling", PIL.Image).LANCZOS + ) + if img.mode != "RGB": + img = img.convert("RGB") + + temp_path = self.images_path / f"edit_input_{uuid.uuid4()}.jpg" + temp_paths.append(temp_path) + img.save(temp_path, format="JPEG", quality=90, optimize=True) + return temp_path + except Exception as e: + print(f"Failed to prepare image for edit, using original file: {e}") + + return image_path + + def _cleanup_temp_images(self, temp_paths: List[Path]) -> None: + for temp_path in temp_paths: + try: + if temp_path.exists(): + temp_path.unlink() + except Exception as e: + print(f"Failed to remove temporary image {temp_path}: {e}") + def _resolve_gpt_size(self, resolution: str, aspect_ratio: str) -> str: """Translate resolution + aspect_ratio into a GPT Image 2 pixel size string. @@ -418,6 +451,8 @@ async def _generate_multimodal_image_and_text(self, prompt: str, style: str = "3 async def _image_editing(self, prompt: str, image_path: str, model: str = "Normal", aspect_ratio: str = "16:9", resolution: str = "2K", gpt_quality: str = "auto", variants: int = 1, caption: str = "Here is your edited image") -> str: """Edit an existing image using Gemini or GPT Image 2""" print(f"Editing image - Prompt: {prompt}, Image: {image_path}, Model: {model}, Aspect Ratio: {aspect_ratio}, Resolution: {resolution}, GPT Quality: {gpt_quality}, Variants: {variants}") + temp_paths: List[Path] = [] + source_image = None try: image_path_obj = self._resolve_image_path(image_path) if not image_path_obj.exists(): @@ -433,8 +468,9 @@ async def _image_editing(self, prompt: str, image_path: str, model: str = "Norma ) all_results.append(r) return "\n".join(all_results) - - source_image = PIL.Image.open(image_path_obj) + + source_image_path = self._prepare_image_for_edit(image_path_obj, temp_paths) + source_image = PIL.Image.open(source_image_path) model_name = "gemini-3-pro-image-preview" if model.lower() == "pro" else "gemini-3.1-flash-image-preview" @@ -465,7 +501,7 @@ async def _image_editing(self, prompt: str, image_path: str, model: str = "Norma ) contents = response.candidates[0].content.parts - + for content in contents: if 'text' in content.model_fields_set: try: @@ -490,24 +526,32 @@ async def _image_editing(self, prompt: str, image_path: str, model: str = "Norma document=file, caption=variant_caption ) - + all_saved_paths.append(str(transformed_image_path)) - + if all_saved_paths: paths_str = "\n".join(f" - {p}" for p in all_saved_paths) result_message += f"Edited {len(all_saved_paths)} image(s) and sent to user.\nSaved to:\n{paths_str}\nImage transformation completed successfully.\n" else: result_message += "Warning: No transformed image was generated. The model only provided text response.\n" - + return result_message except Exception as e: return f"Error editing image: {str(e)}" + finally: + if source_image: + source_image.close() + self._cleanup_temp_images(temp_paths) async def _gpt_image_edit(self, prompt: str, image_paths: List[Path], size: str = "2048x1152", quality: str = "auto", caption: str = "Here is your edited image") -> str: """Edit one or more images using OpenAI GPT Image 2""" print(f"Editing image(s) with GPT Image 2 - Prompt: {prompt}, Images: {image_paths}, Size: {size}, Quality: {quality}") + temp_paths: List[Path] = [] try: - image_files = [open(str(p), "rb") for p in image_paths] + image_files = [ + open(str(self._prepare_image_for_edit(p, temp_paths)), "rb") + for p in image_paths + ] kwargs = { "model": "gpt-image-2-2026-04-21", @@ -526,6 +570,7 @@ async def _gpt_image_edit(self, prompt: str, image_paths: List[Path], size: str finally: for f in image_files: f.close() + self._cleanup_temp_images(temp_paths) image_base64 = result.data[0].b64_json image_bytes = base64.b64decode(image_base64) @@ -658,6 +703,8 @@ async def _gpt_image_generate(self, prompt: str, size: str = "2048x1152", qualit async def _image_composition(self, prompt: str, image_paths: List[str], model: str = "Normal", aspect_ratio: str = "16:9", resolution: str = "2K", gpt_quality: str = "auto", variants: int = 1, caption: str = "Here is your composed image") -> str: """Compose a new image from multiple input images using Gemini or GPT Image 2""" print(f"Composing image - Prompt: {prompt}, Images: {image_paths}, Model: {model}, Aspect Ratio: {aspect_ratio}, Resolution: {resolution}, GPT Quality: {gpt_quality}, Variants: {variants}") + temp_paths: List[Path] = [] + images: List[PIL.Image.Image] = [] try: resolved_paths = [] for image_path in image_paths: @@ -683,7 +730,11 @@ async def _image_composition(self, prompt: str, image_paths: List[str], model: s if len(resolved_paths) > 3: return "Error: Maximum 3 images are supported for composition with Normal/Pro mode. Use GPT mode for more images." - images = [PIL.Image.open(p) for p in resolved_paths] + prepared_paths = [ + self._prepare_image_for_edit(p, temp_paths) + for p in resolved_paths + ] + images = [PIL.Image.open(p) for p in prepared_paths] input_contents = [] for image in images: @@ -716,7 +767,7 @@ async def _image_composition(self, prompt: str, image_paths: List[str], model: s ) response_parts = response.candidates[0].content.parts - + for part in response_parts: if 'text' in part.model_fields_set and part.text: try: @@ -753,8 +804,11 @@ async def _image_composition(self, prompt: str, image_paths: List[str], model: s result_message += f"Composed {len(all_saved_paths)} image(s) and sent to user.\nSaved to:\n{paths_str}\nImage composition completed successfully.\n" else: result_message += "Warning: No composed image was generated. The model only provided text response.\n" - + return result_message except Exception as e: return f"Error composing image: {str(e)}" - + finally: + for image in images: + image.close() + self._cleanup_temp_images(temp_paths) diff --git a/app/main_bot.py b/app/main_bot.py index 67ed7ec..4a68bb3 100644 --- a/app/main_bot.py +++ b/app/main_bot.py @@ -86,7 +86,7 @@ def get_prober_name(): anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") anthropic_client = anthropic.AsyncAnthropic(api_key=anthropic_api_key) send_reasoning = True -MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1024") or "1024") +max_image_resolution_vision = int(os.getenv("MAX_IMAGE_RESOLUTION_VISION", "1024") or "1024") user_invites = {} authorized_users = set(telegram_chat_id) @@ -254,13 +254,13 @@ def encode_image(image_path): image_bytes = b"" try: with Image.open(image_path) as img: - needs_resizing = max(img.size) > MAX_IMAGE_SIZE + needs_resizing = max(img.size) > max_image_resolution_vision is_jpeg = (img.format or "").upper() in ("JPEG", "JPG") if needs_resizing or not is_jpeg: if needs_resizing: resample_filter = getattr(Image, 'Resampling', Image).LANCZOS - img.thumbnail((MAX_IMAGE_SIZE, MAX_IMAGE_SIZE), resample_filter) + img.thumbnail((max_image_resolution_vision, max_image_resolution_vision), resample_filter) buffer = io.BytesIO() if img.mode != "RGB": @@ -1487,7 +1487,7 @@ async def voice_button(update: Update, context: ContextTypes.DEFAULT_TYPE): parse_mode="markdown" ) -async def queue_image_media_group(update: Update, context: ContextTypes.DEFAULT_TYPE, user_id: str, image_ref, caption: str | None): +async def queue_image_media_group(update: Update, context: ContextTypes.DEFAULT_TYPE, user_id: str, image_ref: Any, caption: str | None): """Collect Telegram album image refs and process them after updates stop arriving.""" if user_id in media_group_tasks and media_group_tasks[user_id]: media_group_tasks[user_id].cancel() @@ -1530,9 +1530,9 @@ async def process_media_group(update: Update, context: ContextTypes.DEFAULT_TYPE print(f"User {user_id} is waiting for media group, but it is not in media_group_photos") return - if user_id in media_group_waiting_message and media_group_waiting_message[user_id]: - await media_group_waiting_message[user_id].edit_text("🖼️ *Processing media group...*", parse_mode="markdown") - + status_message = media_group_waiting_message.get(user_id) + media_group_waiting_message[user_id] = None + image_refs = media_group_photos[user_id] # Track media group processed @@ -1541,7 +1541,7 @@ async def process_media_group(update: Update, context: ContextTypes.DEFAULT_TYPE print(f"User {user_id} has {len(image_refs)} images in media group") _, _, mg_limit = check_user_limits(user_id) - await process_image_message(update, context, user_id, image_refs, media_group_captions[user_id], mg_limit) + await process_image_message(update, context, user_id, image_refs, media_group_captions[user_id], mg_limit, status_message) finally: # Clean up media group data media_group_id[user_id] = None @@ -1578,7 +1578,7 @@ async def handle_photo_message(update: Update, context: ContextTypes.DEFAULT_TYP # Handle single photo message await process_image_message(update, context, user_id, [update.message.photo[-1]], update.message.caption, limit) -async def process_image_message(update: Update, context: ContextTypes.DEFAULT_TYPE, user_id: str, image_refs: list, caption: str, limit: int | None = None): +async def process_image_message(update: Update, context: ContextTypes.DEFAULT_TYPE, user_id: str, image_refs: list, caption: str | None, limit: int | None = None, status_message: Any = None): """Process one or more Telegram image file refs through the photo analysis pipeline.""" async with get_user_lock(user_id): photos = image_refs @@ -1588,7 +1588,10 @@ async def process_image_message(update: Update, context: ContextTypes.DEFAULT_TY else: describe_question = f"Describe what is in this image and answer to this question: {caption}" - status_message = await update.message.reply_text("🖼️ *Analyzing images...*", parse_mode="markdown") + if status_message: + await status_message.edit_text("🖼️ *Analyzing images...*", parse_mode="markdown") + else: + status_message = await update.message.reply_text("🖼️ *Analyzing images...*", parse_mode="markdown") temp_photos = [] all_descriptions = [] From 2ec0621723b86a238fe11c56375664e3a91fc38b Mon Sep 17 00:00:00 2001 From: mbv06 Date: Fri, 12 Jun 2026 01:09:29 +0300 Subject: [PATCH 5/7] chore: add temporary media directories to .gitignore --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index c3bc098..431cd95 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,8 @@ __pycache__/ .DS_Store data/ .vscode/ + +# Temporary media directories +temp_photos/ +temp_docs/ +temp_audio/ From f9adc6527658ec2f32080d04bb6fc16a0cf1b3d8 Mon Sep 17 00:00:00 2001 From: mbv06 Date: Fri, 12 Jun 2026 01:35:52 +0300 Subject: [PATCH 6/7] Add HEIC and HEIF image support --- app/agents/image_tools.py | 33 ++++++++++++++++++++------------- app/image_utils.py | 37 +++++++++++++++++++++++++++++++++++++ app/main_bot.py | 24 +++++++++++++++++++++--- app/requirements.txt | 3 ++- 4 files changed, 80 insertions(+), 17 deletions(-) create mode 100644 app/image_utils.py diff --git a/app/agents/image_tools.py b/app/agents/image_tools.py index a5fd804..b8de7c8 100644 --- a/app/agents/image_tools.py +++ b/app/agents/image_tools.py @@ -11,6 +11,7 @@ from google.genai import types import PIL.Image import PIL.ImageOps +from image_utils import JPEG_FORMATS load_dotenv() openai_api_key = os.getenv("OPENAI_API_KEY") @@ -276,25 +277,31 @@ def _resolve_image_path(self, image_path: str) -> Path: return p # Return original so the caller can report the correct path def _prepare_image_for_edit(self, image_path: Path, temp_paths: List[Path]) -> Path: - """Create a temporary downsized JPEG for oversized edit inputs.""" - if max_image_resolution_edit > 0: - try: - with PIL.Image.open(image_path) as img: + """Create a temporary JPEG for resized or non-JPEG edit inputs.""" + try: + with PIL.Image.open(image_path) as img: + image_format = (img.format or "").upper() + needs_resizing = max_image_resolution_edit > 0 and max(img.size) > max_image_resolution_edit + needs_conversion = image_format not in JPEG_FORMATS + + if needs_resizing or needs_conversion: img = PIL.ImageOps.exif_transpose(img) - if max(img.size) > max_image_resolution_edit: + + if needs_resizing: img.thumbnail( (max_image_resolution_edit, max_image_resolution_edit), getattr(PIL.Image, "Resampling", PIL.Image).LANCZOS ) - if img.mode != "RGB": - img = img.convert("RGB") - temp_path = self.images_path / f"edit_input_{uuid.uuid4()}.jpg" - temp_paths.append(temp_path) - img.save(temp_path, format="JPEG", quality=90, optimize=True) - return temp_path - except Exception as e: - print(f"Failed to prepare image for edit, using original file: {e}") + if img.mode != "RGB": + img = img.convert("RGB") + + temp_path = self.images_path / f"edit_input_{uuid.uuid4()}.jpg" + temp_paths.append(temp_path) + img.save(temp_path, format="JPEG", quality=90, optimize=True) + return temp_path + except Exception as e: + print(f"Failed to prepare image for edit, using original file: {e}") return image_path diff --git a/app/image_utils.py b/app/image_utils.py new file mode 100644 index 0000000..a29ab49 --- /dev/null +++ b/app/image_utils.py @@ -0,0 +1,37 @@ +from pathlib import Path + +from PIL import Image, ImageOps +from pillow_heif import register_heif_opener + + +register_heif_opener() + +IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".heic", ".heif") +JPEG_FORMATS = {"JPEG", "JPG"} + + +def is_jpeg_image(image_path: str | Path) -> bool: + with Image.open(image_path) as img: + return (img.format or "").upper() in JPEG_FORMATS + + +def save_image_as_jpeg( + source_path: str | Path, + target_path: str | Path, + *, + max_resolution: int = 0, + quality: int = 90, +) -> None: + with Image.open(source_path) as img: + img = ImageOps.exif_transpose(img) + + if max_resolution > 0 and max(img.size) > max_resolution: + img.thumbnail( + (max_resolution, max_resolution), + getattr(Image, "Resampling", Image).LANCZOS, + ) + + if img.mode != "RGB": + img = img.convert("RGB") + + img.save(target_path, format="JPEG", quality=quality, optimize=True) diff --git a/app/main_bot.py b/app/main_bot.py index 4a68bb3..ecd9857 100644 --- a/app/main_bot.py +++ b/app/main_bot.py @@ -23,6 +23,7 @@ from stats import stats_tracker, DEFAULT_ACTION_LIMIT import time from PIL import Image +from image_utils import IMAGE_EXTENSIONS, is_jpeg_image, save_image_as_jpeg # Streaming config - read directly from env, not imported streaming_enabled = os.getenv("STREAMING_ENABLED", "false").lower() == "true" @@ -278,6 +279,14 @@ def encode_image(image_path): return base64.b64encode(image_bytes).decode('utf-8') +def prepare_downloaded_image_for_vision(source_path: str, target_path: str) -> None: + if is_jpeg_image(source_path): + shutil.copy(source_path, target_path) + return + + save_image_as_jpeg(source_path, target_path, quality=90) + + def split_text_intelligently(text: str, max_length: int = 4000) -> list[str]: """ Split text into chunks with a maximum length, trying to split at natural boundaries: @@ -1603,9 +1612,17 @@ async def process_image_message(update: Update, context: ContextTypes.DEFAULT_TY photo_file = await context.bot.get_file(photo.file_id) temp_dir = "temp_photos" os.makedirs(temp_dir, exist_ok=True) + original_extension = os.path.splitext(getattr(photo, "file_name", "") or "")[1].lower() + if original_extension not in IMAGE_EXTENSIONS: + original_extension = ".jpg" + + temp_download = os.path.join(temp_dir, f"photo_{uuid.uuid4()}{original_extension}") + temp_photos.append(temp_download) + await photo_file.download_to_drive(temp_download) + temp_photo = os.path.join(temp_dir, f"photo_{uuid.uuid4()}.jpg") temp_photos.append(temp_photo) - await photo_file.download_to_drive(temp_photo) + prepare_downloaded_image_for_vision(temp_download, temp_photo) user_images_dir = os.path.join("data", user_id, "images") os.makedirs(user_images_dir, exist_ok=True) @@ -1712,7 +1729,7 @@ async def handle_document_message(update: Update, context: ContextTypes.DEFAULT_ await handle_video_message(update, context) return - image_extensions = ['.jpg', '.jpeg'] + image_extensions = IMAGE_EXTENSIONS if file_extension in image_extensions: stats_tracker.track_message_received(user_id, "photo") if update.message.media_group_id: @@ -1728,7 +1745,8 @@ async def handle_document_message(update: Update, context: ContextTypes.DEFAULT_ if file_extension not in supported_extensions: supported_formats = ", ".join([ext.replace(".", "").upper() for ext in supported_extensions]) - await update.message.reply_text(f"❌ Only {supported_formats} documents and JPG/JPEG images are supported.") + supported_image_formats = "/".join(ext.replace(".", "").upper() for ext in image_extensions) + await update.message.reply_text(f"❌ Only {supported_formats} documents and {supported_image_formats} images are supported.") return async with get_user_lock(user_id): diff --git a/app/requirements.txt b/app/requirements.txt index 8db5241..d93c36b 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -21,4 +21,5 @@ docx2txt>=0.8 pandas>=2.0.0 openpyxl>=3.1.2 google-genai>=1.45.0 -pillow>=11.1.0 \ No newline at end of file +pillow>=11.1.0 +pillow-heif>=1.4.0 From 9630112120cdce73b1f0dcb9c94eb415e4c010eb Mon Sep 17 00:00:00 2001 From: mbv06 Date: Fri, 12 Jun 2026 01:45:50 +0300 Subject: [PATCH 7/7] Handle late media group image updates --- app/main_bot.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/app/main_bot.py b/app/main_bot.py index ecd9857..fe1f2a2 100644 --- a/app/main_bot.py +++ b/app/main_bot.py @@ -118,6 +118,7 @@ def get_user_lock(user_id: str) -> asyncio.Lock: media_group_captions = {} media_group_waiting_message = {} media_group_tasks = {} +media_group_processing = {} MEDIA_GROUP_TIMEOUT = 10.0 @@ -1498,7 +1499,10 @@ async def voice_button(update: Update, context: ContextTypes.DEFAULT_TYPE): async def queue_image_media_group(update: Update, context: ContextTypes.DEFAULT_TYPE, user_id: str, image_ref: Any, caption: str | None): """Collect Telegram album image refs and process them after updates stop arriving.""" - if user_id in media_group_tasks and media_group_tasks[user_id]: + is_same_media_group = user_id in media_group_id and media_group_id[user_id] == update.message.media_group_id + is_processing = media_group_processing.get(user_id) and is_same_media_group + + if user_id in media_group_tasks and media_group_tasks[user_id] and not is_processing: media_group_tasks[user_id].cancel() if user_id not in media_group_id or media_group_id[user_id] != update.message.media_group_id: @@ -1515,10 +1519,15 @@ async def queue_image_media_group(update: Update, context: ContextTypes.DEFAULT_ if image_ref not in media_group_photos[user_id]: media_group_photos[user_id].append(image_ref) - await media_group_waiting_message[user_id].edit_text( - f"🖼️ *Image {len(media_group_photos[user_id])} received... Waiting for other images...*", - parse_mode="markdown" - ) + waiting_message = media_group_waiting_message.get(user_id) + if waiting_message and not is_processing: + await waiting_message.edit_text( + f"🖼️ *Image {len(media_group_photos[user_id])} received... Waiting for other images...*", + parse_mode="markdown" + ) + + if is_processing: + return async def process_media_group_with_timeout(): try: @@ -1534,13 +1543,13 @@ async def process_media_group_with_timeout(): async def process_media_group(update: Update, context: ContextTypes.DEFAULT_TYPE, user_id: str): """Process all images in a media group""" + media_group_processing[user_id] = True try: if user_id not in media_group_photos or not media_group_photos[user_id]: print(f"User {user_id} is waiting for media group, but it is not in media_group_photos") return status_message = media_group_waiting_message.get(user_id) - media_group_waiting_message[user_id] = None image_refs = media_group_photos[user_id] @@ -1561,6 +1570,8 @@ async def process_media_group(update: Update, context: ContextTypes.DEFAULT_TYPE media_group_waiting_message[user_id] = None if user_id in media_group_tasks: del media_group_tasks[user_id] + if user_id in media_group_processing: + del media_group_processing[user_id] async def handle_photo_message(update: Update, context: ContextTypes.DEFAULT_TYPE): """Handle incoming photo messages with support for multiple photos"""