diff --git a/backend/app.py b/backend/app.py index 7af84dd..551e42c 100644 --- a/backend/app.py +++ b/backend/app.py @@ -35,4 +35,4 @@ def create_app(): if __name__ == '__main__': print(f"Starting Flask app with SQLite persistence.") print(f"Database will be stored at: {app.config['SQLALCHEMY_DATABASE_URI']}") - app.run(debug=False, host='0.0.0.0', port=5001) + app.run(debug=False, host='0.0.0.0', port=5000) diff --git a/backend/google_veo.py b/backend/google_veo.py index 94fb54a..c86162c 100644 --- a/backend/google_veo.py +++ b/backend/google_veo.py @@ -42,6 +42,7 @@ def _compose_videogen_request( camera_control: str = "", generate_audio: bool = False, resolution: Optional[str] = None, + reference_images: Optional[list] = None, ): if self.model_name.startswith("veo-3.0"): if "durationSeconds" in parameters and isinstance(parameters["durationSeconds"], (int, float)) and parameters["durationSeconds"] > 90: @@ -64,6 +65,11 @@ def _compose_videogen_request( instance["video"] = {"gcsUri": video_uri, "mimeType": "video/mp4"} if last_frame_uri: instance["lastFrame"] = {"gcsUri": last_frame_uri, "mimeType": last_frame_mime_type} + + # Add referenceImages support for veo-2.0-generate-exp + if reference_images and self.model_name == "veo-2.0-generate-exp": + instance["referenceImages"] = reference_images + # Only add cameraControl if the model supports it, it's provided, AND it's not a video extension task if self.model_name != "veo-3.0-generate-001" and camera_control and not video_uri: instance["cameraControl"] = camera_control @@ -98,6 +104,7 @@ def generate_video( camera_control: str = "", generate_audio: bool = False, resolution: Optional[str] = None, + reference_images: Optional[list] = None, ): req = self._compose_videogen_request( prompt=prompt, @@ -110,6 +117,7 @@ def generate_video( camera_control=camera_control, generate_audio=generate_audio, resolution=resolution, + reference_images=reference_images, ) print(f"Sending video generation request: {req}") resp = self._send_request_to_google_api(self.prediction_endpoint, req) diff --git a/backend/migrate_db.py b/backend/migrate_db.py index 0d23d0e..5986508 100644 --- a/backend/migrate_db.py +++ b/backend/migrate_db.py @@ -156,6 +156,9 @@ def setup_database(): # The model uses db.Boolean, so SQLAlchemy handles the abstraction. # When adding manually, 'BOOLEAN' should be acceptable for both via SQLAlchemy's engine. migrate_schema_add_column(engine, 'video_generation_task', 'generate_audio', 'BOOLEAN') + + # Add reference_images_data column for storing reference images JSON data + migrate_schema_add_column(engine, 'video_generation_task', 'reference_images_data', 'TEXT') # Backfill data migrate_data_backfill_user_column(engine) diff --git a/backend/models.py b/backend/models.py index b7dd580..8f861f1 100644 --- a/backend/models.py +++ b/backend/models.py @@ -23,6 +23,7 @@ class VideoGenerationTask(db.Model): last_frame_filename = db.Column(db.String(255), nullable=True) # Filename of the uploaded last frame image last_frame_gcs_uri = db.Column(db.String(1024), nullable=True) # GCS URI of the uploaded last frame image video_uri = db.Column(db.String(1024), nullable=True) # User-added: new video_uri attribute + reference_images_data = db.Column(db.Text, nullable=True) # JSON string for reference images data error_message = db.Column(db.String(1024), nullable=True) user = db.Column(db.String(255), nullable=True) # New field for user email generate_audio = db.Column(db.Boolean, default=False) diff --git a/backend/routes/video.py b/backend/routes/video.py index 6601519..4293ecc 100644 --- a/backend/routes/video.py +++ b/backend/routes/video.py @@ -25,7 +25,7 @@ def generate_video_route(): model = request.form.get('model', DEFAULT_VIDEO_MODEL) aspect_ratio = request.form.get('ratio', '16:9') camera_control = request.form.get('camera_control', 'FIXED') # Get camera_control - duration_seconds = int(request.form.get('duration', 5)) + duration_seconds = int(request.form.get('durationSeconds', 8)) resolution = request.form.get('resolution', None) gcs_output_bucket = request.form.get('gcs_output_bucket', None) generate_audio = request.form.get('generateAudio', 'false').lower() == 'true' @@ -52,6 +52,41 @@ def generate_video_route(): last_frame_filename_to_save = last_frame_img_filename print(f"Saved uploaded last frame image to: {last_frame_image_path}") + # Handle reference images for veo-2.0-generate-exp + reference_images_data = None + if model == 'veo-2.0-generate-exp' and aspect_ratio == '16:9': + reference_images_list = [] + reference_type = request.form.get('reference_type', 'asset') # Default to 'asset' + + # Determine max images based on type: asset=3, style=1 + max_images = 3 if reference_type == 'asset' else 1 + + # Handle multiple reference image files + for i in range(max_images): + ref_file_key = f'reference_image_{i}' + ref_file = request.files.get(ref_file_key) + if ref_file and allowed_file(ref_file.filename): + original_extension_ref = os.path.splitext(ref_file.filename)[1] + ref_img_filename = secure_filename(f"{uuid.uuid4()}_ref_{i}{original_extension_ref}") + ref_image_path = os.path.join(uploads_dir, ref_img_filename) + ref_file.save(ref_image_path) + + reference_images_list.append({ + 'filename': ref_img_filename, + 'type': reference_type # Use the same type for all images in this request + }) + print(f"Saved uploaded reference image {i} to: {ref_image_path}") + + # Validate image count based on type + if reference_images_list: + if reference_type == 'style' and len(reference_images_list) > 1: + return jsonify({"error": "Style type supports maximum 1 reference image"}), 400 + elif reference_type == 'asset' and len(reference_images_list) > 3: + return jsonify({"error": "Asset type supports maximum 3 reference images"}), 400 + + import json + reference_images_data = json.dumps(reference_images_list) + new_task = VideoGenerationTask( prompt=prompt_text, model=model, @@ -62,6 +97,7 @@ def generate_video_route(): gcs_output_bucket=gcs_output_bucket, image_filename=image_filename_to_save, last_frame_filename=last_frame_filename_to_save, + reference_images_data=reference_images_data, user=user_email, generate_audio=generate_audio ) diff --git a/backend/tasks.py b/backend/tasks.py index a7c1f04..9703e1f 100644 --- a/backend/tasks.py +++ b/backend/tasks.py @@ -31,20 +31,12 @@ def _run_video_generation(app, task_id): try: # Model specific checks based on user feedback - # User feedback: "veo-3.0-generate-preview dosen't support lart frame image and 9:16 ratio" - # Assuming "lart frame" means "last frame" - TARGET_MODEL_FOR_CHECKS = "veo-3.0-generate-001" # Or the correct model name if this is a typo - - if task.model == TARGET_MODEL_FOR_CHECKS: + # Both veo-3.0-generate-001 and veo-3.0-fast-generate-001 now support 9:16 aspect ratio + # Only last frame image is still not supported for veo-3.0 models + if task.model.startswith('veo-3.0'): if task.last_frame_filename: task.status = "failed" - task.error_message = f"Model {TARGET_MODEL_FOR_CHECKS} does not support last frame images." - db.session.commit() - print(f"Task {task_id} failed: {task.error_message}") - return - if task.aspect_ratio == "9:16": - task.status = "failed" - task.error_message = f"Model {TARGET_MODEL_FOR_CHECKS} does not support 9:16 aspect ratio." + task.error_message = f"Model {task.model} does not support last frame images." db.session.commit() print(f"Task {task_id} failed: {task.error_message}") return @@ -154,6 +146,48 @@ def _run_video_generation(app, task_id): else: print(f"Last frame image file {task.last_frame_filename} not found for task {task_id}") + # Process reference images for veo-2.0-generate-exp + reference_images_for_api = None + if task.model == "veo-2.0-generate-exp" and hasattr(task, 'reference_images_data') and task.reference_images_data: + reference_images_for_api = [] + try: + import json + reference_images_list = json.loads(task.reference_images_data) if isinstance(task.reference_images_data, str) else task.reference_images_data + + for ref_img in reference_images_list: + if 'filename' in ref_img and 'type' in ref_img: + ref_img_path = os.path.join(uploads_dir, ref_img['filename']) + if os.path.exists(ref_img_path): + # Upload reference image to GCS + if DEFAULT_OUTPUT_GCS_BUCKET: + storage_client_ref = storage.Client() + ref_bucket_name = DEFAULT_OUTPUT_GCS_BUCKET.replace("gs://", "") + bucket_ref = storage_client_ref.bucket(ref_bucket_name) + base_ref_filename = os.path.basename(ref_img['filename']) + ref_blob_name = f"reference_images/{task.id}/{base_ref_filename}" + blob_ref = bucket_ref.blob(ref_blob_name) + + # Determine MIME type + ref_mime_type = "image/jpeg" + filename_lower = ref_img['filename'].lower() + if filename_lower.endswith(".png"): + ref_mime_type = "image/png" + elif filename_lower.endswith(".gif"): + ref_mime_type = "image/gif" + + blob_ref.upload_from_filename(ref_img_path, content_type=ref_mime_type) + ref_gcs_uri = f"gs://{ref_bucket_name}/{ref_blob_name}" + + # Add to API format + reference_images_for_api.append({ + "image": {"gcsUri": ref_gcs_uri, "mimeType": ref_mime_type}, + "referenceType": ref_img['type'] # "asset" or "style" + }) + print(f"Successfully uploaded reference image {ref_img['filename']} to {ref_gcs_uri}") + except Exception as e_ref: + print(f"Error processing reference images for task {task_id}: {e_ref}") + # Continue without reference images if there's an error + # Call GoogleVeo to generate video # Note: model_to_use (task.model or DEFAULT_VIDEO_MODEL) is not used here as GoogleVeo class has a hardcoded model. # This might be a point of future enhancement if model selection is needed with GoogleVeo. @@ -167,7 +201,8 @@ def _run_video_generation(app, task_id): last_frame_mime_type=current_last_frame_mime_type, camera_control=task.camera_control, # Pass camera_control directly generate_audio=task.generate_audio, - resolution=task.resolution + resolution=task.resolution, + reference_images=reference_images_for_api ) # Process the result from GoogleVeo diff --git a/frontend/public/locales/en/translation.json b/frontend/public/locales/en/translation.json index 4bad5eb..57289f1 100644 --- a/frontend/public/locales/en/translation.json +++ b/frontend/public/locales/en/translation.json @@ -142,5 +142,21 @@ "videoEditingUnderDevelopmentNoticeTitle": "Video Editing Feature - Under Development", "videoEditingUnderDevelopmentNoticeBody": "The video editing (Weave) mode is currently under active development. You may experience performance issues or encounter bugs.", "videoEditingUnderDevelopmentNoticeSuggestion": "This feature is CPU intensive. If you find it very slow, please consider upscaling your server resources.", - "page": "Page" + "page": "Page", + "referenceTab": "Reference", + "referenceLabel": "Reference Images", + "referenceTypeLabel": "Type", + "referenceTypeAsset": "Asset", + "referenceTypeStyle": "Style", + "addReferenceImageButton": "Add Reference", + "removeReferenceImageButton": "Remove", + "referenceImagePreviewAlt": "Reference image preview", + "referenceNotSupportedMessage": "Reference images are not supported by this model.", + "maxReferenceImagesReached": "Maximum {{max}} reference images allowed.", + "maxAssetImagesReached": "Asset type supports maximum 3 reference images", + "maxStyleImagesReached": "Style type supports maximum 1 reference image", + "referenceImagesOnly16x9": "Reference images only support 16:9 ratio and do not support 9:16 resolution", + "uploadReferenceImageButtonTitle": "Upload reference image", + "pasteReferenceImageFromClipboardButtonTitle": "Paste reference image from clipboard", + "clearReferenceImageButtonTitle": "Clear reference image" } diff --git a/frontend/public/locales/zh-CN/translation.json b/frontend/public/locales/zh-CN/translation.json index a043802..e8eeb34 100644 --- a/frontend/public/locales/zh-CN/translation.json +++ b/frontend/public/locales/zh-CN/translation.json @@ -141,5 +141,21 @@ "videoEditingUnderDevelopmentNoticeTitle": "视频编辑功能 - 开发中", "videoEditingUnderDevelopmentNoticeBody": "视频编辑(织梦)模式目前正在积极开发中。您可能会遇到性能问题或错误。", "videoEditingUnderDevelopmentNoticeSuggestion": "此功能占用大量CPU资源。如果运行非常缓慢,请考虑升级您的服务器配置。", - "page": "页" + "page": "页", + "referenceTab": "参考图", + "referenceLabel": "参考图像", + "referenceTypeLabel": "类型", + "referenceTypeAsset": "资产", + "referenceTypeStyle": "风格", + "addReferenceImageButton": "添加参考图", + "removeReferenceImageButton": "移除", + "referenceImagePreviewAlt": "参考图像预览", + "referenceNotSupportedMessage": "此模型不支持参考图像功能。", + "maxReferenceImagesReached": "最多只能添加 {{max}} 张参考图像。", + "maxAssetImagesReached": "资产类型最多只能添加 3 张参考图像", + "maxStyleImagesReached": "风格类型最多只能添加 1 张参考图像", + "referenceImagesOnly16x9": "参考图功能仅支持16:9比例,且不支持9:16分辨率", + "uploadReferenceImageButtonTitle": "上传参考图像", + "pasteReferenceImageFromClipboardButtonTitle": "从剪贴板粘贴参考图像", + "clearReferenceImageButtonTitle": "清除参考图像" } diff --git a/frontend/src/App.js b/frontend/src/App.js index 38c96cc..f5120fc 100644 --- a/frontend/src/App.js +++ b/frontend/src/App.js @@ -41,7 +41,7 @@ function App() { const [model, setModel] = useState('veo-3.0-generate-001'); // Default model const [ratio, setRatio] = useState('16:9'); // Default ratio const [cameraControl, setCameraControl] = useState(''); // Default camera control - const [duration, setDuration] = useState(5); // Default duration in seconds, changed to 5 + const [duration, setDuration] = useState(8); // Default duration in seconds, changed to 8 for better compatibility const [resolution, setResolution] = useState('720p'); // Default resolution const [gcsOutputBucket, setGcsOutputBucket] = useState(''); // GCS output bucket const [generateAudio, setGenerateAudio] = useState(false); @@ -92,6 +92,10 @@ function App() { const [generatedLastFrameImageUrl, setGeneratedLastFrameImageUrl] = useState(''); // To store the URL from backend for last frame const [lastFrameImageGenerationError, setLastFrameImageGenerationError] = useState(''); // Error specific to last frame + // Reference images state for veo-2.0-generate-exp + const [referenceImages, setReferenceImages] = useState([]); // Array of {id, file, preview, type} + const [referenceType, setReferenceType] = useState('asset'); // 'asset' or 'style' + const handleClearMusicSelection = () => { setSelectedMusicFile(null); setUploadedMusicBackendUrl(null); @@ -206,6 +210,7 @@ function App() { const fileInputRef = useRef(null); // Ref for the file input element const lastImagePreviewRef = useRef(null); // New ref for last image preview const lastFileInputRef = useRef(null); // New ref for last image file input + const referenceFileInputRef = useRef(null); // New ref for reference image file input const userDropdownRef = useRef(null); // Ref for user dropdown // Function to ensure track playback is stopped @@ -267,11 +272,27 @@ function App() { if (model !== 'veo-2.0-generate-exp') { setCameraControl(''); // Reset to default if model does not support camera control } - if (model === 'veo-3.0-generate-001' || model === 'veo-2.0-generate-exp') { - if (duration !== 8) { - setDuration(8); + if (model === 'veo-2.0-generate-exp') { + // veo-2.0-generate-exp supports 5,6,7,8s, but when using Reference, only 8s is supported + const isUsingReference = referenceImages.length > 0; + if (isUsingReference) { + // When using Reference, only 8s is supported + if (duration !== 8) { + setDuration(8); + } + } else { + // When not using Reference, 5,6,7,8s are supported + if (![5, 6, 7, 8].includes(duration)) { + setDuration(8); // Default to 8s for veo-2.0-generate-exp + } + } + } else if (model.startsWith('veo-3.0')) { + // veo-3.0 models support 4s, 6s, 8s + if (![4, 6, 8].includes(duration)) { + setDuration(8); // Default to 8s for veo-3.0 models } } else { + // veo-2.0-generate-001 supports 5s, 6s, 7s, 8s if (![5, 6, 7, 8].includes(duration)) { setDuration(5); } @@ -290,15 +311,29 @@ function App() { setActiveImageTab('first'); } - // Limitation: No 9:16 aspect ratio - if (ratio === '9:16') { - setRatio('16:9'); // Default to 16:9 - } + // Note: 9:16 aspect ratio is now supported for veo-3.0 models } else { setResolution('720p'); } // eslint-disable-next-line react-hooks/exhaustive-deps - }, [model, duration, selectedLastImage, ratio, activeImageTab, setCameraControl, setDuration, setRatio, setActiveImageTab]); // Added setters to dependency array as per exhaustive-deps, clearLastImagePreview is defined in scope + }, [model, selectedLastImage, ratio, activeImageTab, referenceImages.length]); // Removed duration from dependencies to prevent infinite loops + + // Effect to clear reference images when first/last frame images are uploaded or 9:16 ratio is selected + useEffect(() => { + const shouldClearReference = + (imagePreview || lastImagePreview) || // Has first or last frame image + ratio === '9:16'; // Or using 9:16 ratio + + if (shouldClearReference && referenceImages.length > 0) { + // Clear all reference images + setReferenceImages([]); + + // If currently on reference tab, switch to first frame tab + if (activeImageTab === 'reference') { + setActiveImageTab('first'); + } + } + }, [imagePreview, lastImagePreview, ratio, referenceImages.length, activeImageTab]); // Effect to stop track playback if view changes from 'create' while playing const prevActiveViewRef = useRef(); @@ -623,8 +658,62 @@ function App() { const doHandleLastImageChange = (e) => Handlers.handleLastImageChange(e, setSelectedLastImage, setLastImagePreview, lastFileInputRef); const doClearLastImagePreview = () => Handlers.clearLastImagePreview(setSelectedLastImage, setLastImagePreview, lastFileInputRef); + // Reference image handlers + const handleReferenceImageChange = (e) => { + const file = e.target.files[0]; + const maxImages = referenceType === 'asset' ? 3 : 1; + + if (file && referenceImages.length < maxImages) { + const reader = new FileReader(); + reader.onloadend = () => { + const newReferenceImage = { + id: Date.now() + Math.random(), // Simple unique ID + file: file, + preview: reader.result, + type: referenceType + }; + setReferenceImages(prev => [...prev, newReferenceImage]); + }; + reader.readAsDataURL(file); + } + // Clear the input + if (referenceFileInputRef.current) { + referenceFileInputRef.current.value = ''; + } + }; + + // Handle reference type change with automatic cleanup + const handleReferenceTypeChange = (newType) => { + setReferenceType(newType); + + // If switching to style type and there are more than 1 image, keep only the first one + if (newType === 'style' && referenceImages.length > 1) { + setReferenceImages(prev => [prev[0]]); + } + + // Update the type of existing images + setReferenceImages(prev => prev.map(img => ({ ...img, type: newType }))); + }; + + const handleRemoveReferenceImage = (id) => { + setReferenceImages(prev => prev.filter(img => img.id !== id)); + }; + + // Enhanced paste handler - reference images do not support clipboard paste + const doHandlePasteFromClipboardEnhanced = (target) => { + if (target === 'reference') { + // Reference images do not support clipboard paste + console.log('Clipboard paste is not supported for reference images'); + return; + } else { + // Use original handler for first and last frame + Handlers.handlePasteFromClipboard(target, setSelectedImage, setImagePreview, setSelectedLastImage, setLastImagePreview, setErrorMessage, t); + } + }; + const doHandleGenerateClick = () => Api.handleGenerateClick({ prompt, model, ratio, cameraControl, duration, gcsOutputBucket, selectedImage, selectedLastImage, generateAudio, resolution, + referenceImages, referenceType, setIsLoading, setErrorMessage, setVideoGcsUri, setTaskStatus, setCompletedUriPollRetries, pollingIntervalId, setPollingIntervalId, setTaskId, getTasks: memoizedFetchHistoryTasks, t, }); @@ -800,7 +889,7 @@ function App() { onClearImagePreview={doClearImagePreview} fileInputRef={fileInputRef} onImageChange={doHandleImageChange} - onPasteFromClipboard={doHandlePasteFromClipboard} + onPasteFromClipboard={doHandlePasteFromClipboardEnhanced} onGenerateFirstFrameImage={handleGenerateFirstFrameImage} isGeneratingFirstFrame={isGeneratingFirstFrame} lastImagePreviewRef={lastImagePreviewRef} @@ -810,6 +899,13 @@ function App() { onLastImageChange={doHandleLastImageChange} onGenerateLastFrameImage={handleGenerateLastFrameImage} isGeneratingLastFrame={isGeneratingLastFrame} + // Reference Props for veo-2.0-generate-exp + referenceImages={referenceImages} + referenceType={referenceType} + onReferenceTypeChange={handleReferenceTypeChange} + referenceFileInputRef={referenceFileInputRef} + onReferenceImageChange={handleReferenceImageChange} + onRemoveReferenceImage={handleRemoveReferenceImage} ratio={ratio} onRatioChange={setRatio} cameraControl={cameraControl} diff --git a/frontend/src/api.js b/frontend/src/api.js index 244de2b..df78aff 100644 --- a/frontend/src/api.js +++ b/frontend/src/api.js @@ -231,6 +231,8 @@ export const handleGenerateClick = async ({ selectedLastImage, generateAudio, resolution, + referenceImages, + referenceType, setIsLoading, setErrorMessage, setVideoGcsUri, @@ -262,7 +264,7 @@ export const handleGenerateClick = async ({ payload.append('model', model); payload.append('ratio', ratio); payload.append('camera_control', cameraControl); - payload.append('duration', parseInt(duration, 10)); + payload.append('durationSeconds', parseInt(duration, 10)); if (gcsOutputBucket.trim()) { payload.append('gcs_output_bucket', gcsOutputBucket.trim()); } @@ -277,6 +279,16 @@ export const handleGenerateClick = async ({ payload.append('resolution', resolution); } + // Handle reference images for veo-2.0-generate-exp + if (model === 'veo-2.0-generate-exp' && ratio === '16:9' && referenceImages && referenceImages.length > 0) { + payload.append('reference_type', referenceType); + referenceImages.forEach((refImg, index) => { + if (refImg.file) { + payload.append(`reference_image_${index}`, refImg.file); + } + }); + } + const response = await fetch(`${BACKEND_URL}/generate-video`, { method: 'POST', body: payload, @@ -386,7 +398,7 @@ export const pollTaskStatus = async ({ setModel(data.model || 'veo-3.0-generate-001'); setRatio(data.aspect_ratio || '16:9'); setCameraControl(data.camera_control || 'FIXED'); - setDuration(data.duration_seconds || 5); + setDuration(data.duration_seconds || 8); setResolution(data.resolution || ''); setGcsOutputBucket(data.gcs_output_bucket || ''); } @@ -455,7 +467,7 @@ export const handleDeleteTask = async ({ setModel('veo-3.0-generate-001'); setRatio('16:9'); setCameraControl('FIXED'); - setDuration(5); + setDuration(8); setGcsOutputBucket(''); setTaskId(null); setTaskStatus(''); diff --git a/frontend/src/components/Sidebar.js b/frontend/src/components/Sidebar.js index ec9b7ec..d2eb6d8 100644 --- a/frontend/src/components/Sidebar.js +++ b/frontend/src/components/Sidebar.js @@ -31,6 +31,13 @@ function Sidebar({ onLastImageChange, onGenerateLastFrameImage, // New prop isGeneratingLastFrame, // New prop + // Reference Props for veo-2.0-generate-exp + referenceImages, + referenceType, + onReferenceTypeChange, + referenceFileInputRef, + onReferenceImageChange, + onRemoveReferenceImage, ratio, onRatioChange, cameraControl, @@ -127,6 +134,17 @@ function Sidebar({ {t('lastFrameTab')} + {model === 'veo-2.0-generate-exp' && ratio === '16:9' && !imagePreview && !lastImagePreview && ( +
- {t('aspectRatio9x16Warning')} -
- )}