diff --git a/docs/en/notes/mm_guide/image_understanding/image_vqa_api.md b/docs/en/notes/mm_guide/image_understanding/image_vqa_api.md index a7f230f0..917e5400 100644 --- a/docs/en/notes/mm_guide/image_understanding/image_vqa_api.md +++ b/docs/en/notes/mm_guide/image_understanding/image_vqa_api.md @@ -84,7 +84,7 @@ The input file must contain the image path and a prompt to guide the VQA generat "conversation": [ { "from": "human", - "value": "Generate complete QA pairs based on image content and caption." + "value": "Please generate a relevant question based on the content of the picture, and only output the question content." } ] } @@ -108,7 +108,14 @@ The generated VQA results are stored as text in the `vqa` field, typically conta [ { "image": ["./example_data/image_vqa/person.png"], - "vqa": "- Q: What is the title of the movie shown on the poster?\n A: Nightmare Alley\n\n- Q: What color is the film’s title text?\n A: Gold" + "conversation":[ + { + "from":"human", + "value":"Please generate a relevant question based on the content of the picture, and only output the question content." + } + ], + "question":"Who is the main actor in the movie \"Nightmare Alley\"?", + "answer":"The main actor in the movie \"Nightmare Alley\" is Bradley Cooper." } ] @@ -141,15 +148,15 @@ class ImageVQAPipeline: self.storage = FileStorage( first_entry_file_name="./example_data/image_vqa/sample_data.json", cache_path="./cache_local", - file_name_prefix="qa", + file_name_prefix="qa_api", cache_type="json", ) # ---------- 2. 
Serving ---------- self.vlm_serving = APIVLMServing_openai( - api_url="http://172.96.141.132:3001/v1", # Any API platform compatible with OpenAI format - key_name_of_api_key="DF_API_KEY", # Set the API key in environment variable or line 4 - model_name="gpt-5-nano-2025-08-07", + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + key_name_of_api_key="DF_API_KEY", # Set the API key for the corresponding platform in the environment variable or line 4 + model_name="qwen3-vl-8b-instruct", image_io=None, send_request_stream=False, max_workers=10, @@ -165,16 +172,26 @@ class ImageVQAPipeline: # ------------------------------------------------------------------ # def forward(self): input_image_key = "image" - output_answer_key = "vqa" + output_step1_key = "question" + output_step2_key = "answer" + # Step 1: Generate the question for the image self.vqa_generator.run( storage=self.storage.step(), input_conversation_key="conversation", input_image_key=input_image_key, - output_answer_key=output_answer_key, + output_answer_key=output_step1_key, ) -# ---------------------------- CLI Entry ------------------------------- # + # Step 2: Generate the answer for the question + self.vqa_generator.run( + storage=self.storage.step(), + input_prompt_key=output_step1_key, + input_image_key=input_image_key, + output_answer_key=output_step2_key, + ) + +# ---------------------------- CLI Entry ------------------------------- # if __name__ == "__main__": pipe = ImageVQAPipeline() pipe.forward() diff --git a/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md index 799a867e..58296244 100644 --- a/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md +++ b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md @@ -1,3 +1,8 @@ +--- +title: vision_mct_reasoning_pipeline +createTime: 2026/04/11 
20:03:51 +permalink: /en/mm_guide/ifj5ge8z/ +--- ``` diff --git a/docs/zh/notes/mm_guide/image_understanding/image_vqa_api.md b/docs/zh/notes/mm_guide/image_understanding/image_vqa_api.md index 67120d2a..08e608aa 100644 --- a/docs/zh/notes/mm_guide/image_understanding/image_vqa_api.md +++ b/docs/zh/notes/mm_guide/image_understanding/image_vqa_api.md @@ -44,7 +44,7 @@ dataflowmm init ### 第三步:下载示例数据 ```bash -huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir dexample_dataa +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data ``` @@ -84,7 +84,7 @@ python api_pipelines/image_vqa.py "conversation": [ { "from": "human", - "value": "Generate complete QA pairs based on image content and caption." + "value": "Please generate a relevant question based on the content of the picture, and only output the question content." } ] } @@ -108,7 +108,14 @@ python api_pipelines/image_vqa.py [ { "image": ["./example_data/image_vqa/person.png"], - "vqa": "- Q: What is the title of the movie shown on the poster?\n A: Nightmare Alley\n\n- Q: What color is the film’s title text?\n A: Gold" + "conversation":[ + { + "from":"human", + "value":"Please generate a relevant question based on the content of the picture, and only output the question content." + } + ], + "question":"Who is the main actor in the movie \"Nightmare Alley\"?", + "answer":"The main actor in the movie \"Nightmare Alley\" is Bradley Cooper." } ] @@ -141,15 +148,15 @@ class ImageVQAPipeline: self.storage = FileStorage( first_entry_file_name="./example_data/image_vqa/sample_data.json", cache_path="./cache_local", - file_name_prefix="qa", + file_name_prefix="qa_api", cache_type="json", ) # ---------- 2. 
Serving ---------- self.vlm_serving = APIVLMServing_openai( - api_url="http://172.96.141.132:3001/v1", # Any API platform compatible with OpenAI format + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format key_name_of_api_key="DF_API_KEY", # Set the API key for the corresponding platform in the environment variable or line 4 - model_name="gpt-5-nano-2025-08-07", + model_name="qwen3-vl-8b-instruct", image_io=None, send_request_stream=False, max_workers=10, @@ -165,13 +172,23 @@ class ImageVQAPipeline: # ------------------------------------------------------------------ # def forward(self): input_image_key = "image" - output_answer_key = "vqa" + output_step1_key = "question" + output_step2_key = "answer" + # Step 1: Generate the question for the image self.vqa_generator.run( storage=self.storage.step(), input_conversation_key="conversation", input_image_key=input_image_key, - output_answer_key=output_answer_key, + output_answer_key=output_step1_key, + ) + + # Step 2: Generate the answer for the question + self.vqa_generator.run( + storage=self.storage.step(), + input_prompt_key=output_step1_key, + input_image_key=input_image_key, + output_answer_key=output_step2_key, ) # ---------------------------- CLI 入口 -------------------------------- # diff --git a/package-lock.json b/package-lock.json index 9fa2e975..0c0a868e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -168,6 +168,7 @@ "integrity": "sha512-cZ0Iq3OzFUPpgszzDr1G1aJV5UMIZ4VygJ2Az252q4Rdf5cQMhYEIKArWY/oUjMhQmosM8ygOovNq7gvA9CdCg==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@algolia/client-common": "5.29.0", "@algolia/requester-browser-xhr": "5.29.0", @@ -2051,6 +2052,7 @@ "integrity": "sha512-promo4eFwuiW+TfGxhi+0x3czqTYJkG8qB17ZUJiVF10Xm7NLVRSLUsfRTU/6h1e24VvRnXCx+hG7li58lkzog==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@types/linkify-it": "^5", "@types/mdurl": "^2" @@ -2364,6 +2366,7 @@ 
"integrity": "sha512-59oBof+QaCyrZVOussrmv3bHxpwFPsLlI9yQbq2ubR+dFNzgfAtb8Dpm2z9iB/duZnx6PgmWPke4qGl9wOjEKw==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@vitejs/plugin-vue": "^5.2.3", "@vuepress/bundlerutils": "2.0.0-rc.23", @@ -3265,6 +3268,7 @@ "integrity": "sha512-E2l6AlTWGznM2e7vEE6T6hzObvEyXukxMOlBmVlMyixZyK1umuO/CiVc6sDBbzVH0oEviCE5IfVY1oZBmccYPQ==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@algolia/client-abtesting": "5.29.0", "@algolia/client-analytics": "5.29.0", @@ -3493,6 +3497,7 @@ } ], "license": "MIT", + "peer": true, "dependencies": { "caniuse-lite": "^1.0.30001726", "electron-to-chromium": "^1.5.173", @@ -4107,6 +4112,7 @@ "dev": true, "hasInstallScript": true, "license": "MIT", + "peer": true, "bin": { "esbuild": "bin/esbuild" }, @@ -4259,6 +4265,7 @@ "integrity": "sha512-7Ke1jyybbbPZyZXFxEftUtxFGLMpE2n6A+z//m4CRDlj0hW+o3iYSmh8nFlYMurOiJVDmJRilUQtJr08KfIxlg==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "tabbable": "^6.2.0" } @@ -4970,6 +4977,7 @@ "https://github.com/sponsors/katex" ], "license": "MIT", + "peer": true, "dependencies": { "commander": "^8.3.0" }, @@ -5155,6 +5163,7 @@ "integrity": "sha512-a54IwgWPaeBCAAsv13YgmALOF1elABB08FxO9i+r4VFk5Vl4pKokRPeX8u5TCgSsPi6ec1otfLjdOpVcgbpshg==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "argparse": "^2.0.1", "entities": "^4.4.0", @@ -5775,6 +5784,7 @@ } ], "license": "MIT", + "peer": true, "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", @@ -6143,6 +6153,7 @@ "integrity": "sha512-Ack2K8rc57kCFcYlf3HXpZEJFNUX8xd8DILldksREmYXQkRHI879yy8q4mRDJgrojkySMZqmmmW1NxrFxMsYaA==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@bufbuild/protobuf": "^2.5.0", "buffer-builder": "^0.2.0", @@ -6875,6 +6886,7 @@ "integrity": "sha512-M7BAV6Rlcy5u+m6oPhAPFgJTzAioX/6B0DxyvDlo9l8+T3nLKbrczg2WLUyzd45L8RqfUMyGPzekbMvX2Ldkwg==", "dev": true, "license": "MIT", + "peer": true, "engines": { "node": 
">=12" }, @@ -6957,6 +6969,7 @@ "integrity": "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==", "dev": true, "license": "Apache-2.0", + "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -7258,6 +7271,7 @@ "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.7.tgz", "integrity": "sha512-mQYaKepA0NGMBsz8Xktt3tJUG5ELE2iT7IJ+ssXI6nxVdE2sFc/d/6w/JByqMLvWg8hNKHpPgzjgOkrhpKFnrA==", "dev": true, + "peer": true, "dependencies": { "esbuild": "^0.25.0", "fdir": "^6.4.4", @@ -7348,6 +7362,7 @@ "integrity": "sha512-M7BAV6Rlcy5u+m6oPhAPFgJTzAioX/6B0DxyvDlo9l8+T3nLKbrczg2WLUyzd45L8RqfUMyGPzekbMvX2Ldkwg==", "dev": true, "license": "MIT", + "peer": true, "engines": { "node": ">=12" }, @@ -7361,6 +7376,7 @@ "integrity": "sha512-LbHV3xPN9BeljML+Xctq4lbz2lVHCR6DtbpTf5XIO6gugpXUN49j2QQPcMj086r9+AkJ0FfUT8xjulKKBkkr9g==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@vue/compiler-dom": "3.5.17", "@vue/compiler-sfc": "3.5.17", @@ -7406,6 +7422,7 @@ "integrity": "sha512-XID/zr7qDGLg7oYGwDTZpWRNXCVQcI1wQTfkN0spyumV2EpHe7XBsmnwICd+dTqRNZuD+JHyJsYLEqDEszFObw==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@vuepress/cli": "2.0.0-rc.23", "@vuepress/client": "2.0.0-rc.23", @@ -7673,6 +7690,7 @@ "integrity": "sha512-M7BAV6Rlcy5u+m6oPhAPFgJTzAioX/6B0DxyvDlo9l8+T3nLKbrczg2WLUyzd45L8RqfUMyGPzekbMvX2Ldkwg==", "dev": true, "license": "MIT", + "peer": true, "engines": { "node": ">=12" },