From e480c00ef70b54c9fe00ae419a2f45c6ad3802fb Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 26 Feb 2026 10:01:06 +0000
Subject: [PATCH 1/2] Initial plan


From 406ab7331fc935f5f87b1a09b4528ff879469118 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 26 Feb 2026 10:10:38 +0000
Subject: [PATCH 2/2] feat: integrate Claude Opus vision API for image analysis

Co-authored-by: leekHotline <117092932+leekHotline@users.noreply.github.com>
---
Review notes (not part of the commit message):
 * service.py: the fallback text and clarity note now follow the requested
   language; the image is read via asyncio.to_thread; unsupported media
   types are rejected before the API is called; the Anthropic client is
   closed after use; the first text block is selected explicitly; an empty
   reply degrades to the fallback; API failures are logged with a traceback.
 * The service.py hunk was edited by hand, so the post-image hash on its
   index line was not regenerated.

 backend/app/core/config.py                    |   4 +
 backend/app/modules/image_analysis/router.py  |   2 +-
 backend/app/modules/image_analysis/service.py | 165 ++++++++++++++++--
 backend/pyproject.toml                        |   1 +
 backend/uv                                    |   0
 5 files changed, 152 insertions(+), 20 deletions(-)
 mode change 100644 => 100755 backend/uv

diff --git a/backend/app/core/config.py b/backend/app/core/config.py
index 3b1e426..b3bb31c 100644
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -46,6 +46,10 @@ class Settings(BaseSettings):
     MAX_VIDEO_SIZE: int = 50 * 1024 * 1024  # 50MB
     ALLOWED_VIDEO_TYPES: list[str] = ["video/mp4", "video/quicktime", "video/webm"]
 
+    # Anthropic Claude
+    ANTHROPIC_API_KEY: str = ""
+    CLAUDE_MODEL: str = "claude-opus-4-5"
+
     model_config = {"env_file": ".env", "extra": "ignore"}
 
 
diff --git a/backend/app/modules/image_analysis/router.py b/backend/app/modules/image_analysis/router.py
index 4c8aad1..9583131 100644
--- a/backend/app/modules/image_analysis/router.py
+++ b/backend/app/modules/image_analysis/router.py
@@ -18,5 +18,5 @@ async def describe_image(
     db: AsyncSession = Depends(get_db),
 ) -> schemas.ImageDescribeResponse:
     """Describe an image to help visually impaired users understand its content."""
-    result = await service.describe_image(payload.image_file_id, payload.language)
+    result = await service.describe_image(db, payload.image_file_id, payload.language)
     return schemas.ImageDescribeResponse(**result)
diff --git a/backend/app/modules/image_analysis/service.py b/backend/app/modules/image_analysis/service.py
index 11baf08..dbbcbca 100644
--- a/backend/app/modules/image_analysis/service.py
+++ b/backend/app/modules/image_analysis/service.py
@@ -1,20 +1,147 @@
 """Image analysis business logic."""
-
-
-async def describe_image(image_file_id: str, language: str) -> dict:
-    """Describe image content for visually impaired users.
-
-    This is a placeholder implementation. In production, this would call
-    a real vision AI service (e.g., GPT-4 Vision, Tencent Cloud OCR, etc.)
-    to generate a natural-language description of the image.
-
-    When the image is blurry or unclear, the service should indicate that
-    and provide the best possible description.
-    """
-    # TODO: integrate real vision AI service
-    return {
-        "description": f"[Image description placeholder for file: {image_file_id}]",
-        "is_clear": True,
-        "clarity_note": None,
-        "confidence": 0.0,
-    }
+import asyncio
+import base64
+import logging
+
+import anthropic
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.core.config import settings
+from app.modules.uploads.service import get_upload_content_path, get_uploaded_file
+
+logger = logging.getLogger(__name__)
+
+# Media types the Claude Messages API accepts for base64 image sources.
+_SUPPORTED_MEDIA_TYPES = frozenset({"image/jpeg", "image/png", "image/gif", "image/webp"})
+
+# Substrings suggesting the model reported a blurry or unclear image.
+_UNCLEAR_MARKERS = ("blurry", "unclear", "模糊", "不清晰", "看不清")
+
+# Prompts and user-facing strings keyed by prompt language.
+_MESSAGES = {
+    "zh": {
+        "system": (
+            "你是一个专门帮助视觉障碍用户的AI助手。"
+            "请用清晰、详细的语言描述图片内容,包括主要对象、颜色、场景和任何重要细节。"
+            "如果图片模糊或不清晰,请说明这一点。"
+        ),
+        "user": "请描述这张图片的内容。",
+        "unavailable": "图片描述服务暂时不可用,请稍后重试。",
+        "unclear": "图片可能模糊或不清晰",
+    },
+    "en": {
+        "system": (
+            "You are an AI assistant specialized in helping visually impaired users. "
+            "Describe the image content clearly and in detail, including main objects, "
+            "colors, scene, and any important details. "
+            "If the image is blurry or unclear, please note that."
+        ),
+        "user": "Please describe this image.",
+        "unavailable": (
+            "The image description service is temporarily unavailable. "
+            "Please try again later."
+        ),
+        "unclear": "The image may be blurry or unclear.",
+    },
+}
+
+
+def _language_key(language: str) -> str:
+    """Return "zh" when language starts with "zh", otherwise "en"."""
+    return "zh" if language.startswith("zh") else "en"
+
+
+def _fallback_response(language: str) -> dict:
+    """Build a fresh "service unavailable" payload in the caller's language."""
+    return {
+        "description": _MESSAGES[_language_key(language)]["unavailable"],
+        "is_clear": True,
+        "clarity_note": None,
+        "confidence": 0.0,
+    }
+
+
+async def describe_image(db: AsyncSession, image_file_id: str, language: str) -> dict:
+    """Describe image content for visually impaired users using Claude vision API.
+
+    Returns a dict with ``description``, ``is_clear``, ``clarity_note`` and
+    ``confidence`` keys. A placeholder is returned when no API key is
+    configured. A localized "service unavailable" payload is returned when
+    the upload is missing, unreadable or of an unsupported media type, or
+    when the API call fails or yields no text.
+    """
+    if not settings.ANTHROPIC_API_KEY:
+        return {
+            "description": f"[Image description placeholder for file: {image_file_id}]",
+            "is_clear": True,
+            "clarity_note": None,
+            "confidence": 0.0,
+        }
+
+    record = await get_uploaded_file(db, image_file_id)
+    if record is None:
+        return _fallback_response(language)
+
+    if record.mime_type not in _SUPPORTED_MEDIA_TYPES:
+        logger.warning(
+            "Unsupported media type %s for image %s", record.mime_type, image_file_id
+        )
+        return _fallback_response(language)
+
+    try:
+        file_path = get_upload_content_path(record)
+        # Read off the event loop so slow storage does not stall other requests.
+        image_data = await asyncio.to_thread(file_path.read_bytes)
+    except OSError as exc:
+        logger.warning("Failed to read image file %s: %s", image_file_id, exc)
+        return _fallback_response(language)
+
+    texts = _MESSAGES[_language_key(language)]
+    image_b64 = base64.b64encode(image_data).decode("utf-8")
+
+    try:
+        # The context manager closes the client's HTTP resources on exit.
+        async with anthropic.AsyncAnthropic(api_key=settings.ANTHROPIC_API_KEY) as client:
+            message = await client.messages.create(
+                model=settings.CLAUDE_MODEL,
+                max_tokens=1024,
+                system=texts["system"],
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "image",
+                                "source": {
+                                    "type": "base64",
+                                    "media_type": record.mime_type,
+                                    "data": image_b64,
+                                },
+                            },
+                            {"type": "text", "text": texts["user"]},
+                        ],
+                    }
+                ],
+            )
+    except Exception:
+        # Boundary handler: any SDK or network failure degrades to the fallback.
+        logger.exception("Claude API call failed for image %s", image_file_id)
+        return _fallback_response(language)
+
+    # Content is a list of typed blocks; use the first text block.
+    description = next(
+        (block.text for block in message.content if block.type == "text"), ""
+    )
+    if not description:
+        logger.warning("Claude returned no text for image %s", image_file_id)
+        return _fallback_response(language)
+
+    # Keyword heuristic only; it can misfire on phrases such as "not blurry".
+    lowered = description.lower()
+    is_unclear = any(marker in lowered for marker in _UNCLEAR_MARKERS)
+    return {
+        "description": description,
+        "is_clear": not is_unclear,
+        "clarity_note": texts["unclear"] if is_unclear else None,
+        "confidence": 0.95,
+    }
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index 3821bba..2606cc6 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -16,6 +16,7 @@ dependencies = [
     "pydantic-settings>=2.12.0",
     "python-jose[cryptography]>=3.5.0",
     "python-multipart>=0.0.22",
+    "anthropic>=0.40.0",
     "sqlalchemy[asyncio]>=2.0.46",
     "uvicorn[standard]>=0.40.0",
 ]
diff --git a/backend/uv b/backend/uv
old mode 100644
new mode 100755