diff --git a/.gitignore b/.gitignore
index 14a220f..08002cc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@
 requirement_added.txt
 mineru.json
 test/example
+test_pipeline/
 reasoning_chains
 cache*
 ckpt*
diff --git a/dataflow/statics/pipelines/api_pipelines/image_vqa.py b/dataflow/statics/pipelines/api_pipelines/image_vqa.py
index 37c07ea..122227a 100644
--- a/dataflow/statics/pipelines/api_pipelines/image_vqa.py
+++ b/dataflow/statics/pipelines/api_pipelines/image_vqa.py
@@ -20,15 +20,15 @@ def __init__(self, llm_serving: LLMServingABC = None):
         self.storage = FileStorage(
             first_entry_file_name="./example_data/image_vqa/sample_data.json",
             cache_path="./cache_local",
-            file_name_prefix="qa",
+            file_name_prefix="qa_api",
             cache_type="json",
         )
 
         # ---------- 2. Serving ----------
         self.vlm_serving = APIVLMServing_openai(
-            api_url="http://172.96.141.132:3001/v1",  # Any API platform compatible with OpenAI format
+            api_url="https://dashscope.aliyuncs.com/compatible-mode/v1",  # Any API platform compatible with OpenAI format
             key_name_of_api_key="DF_API_KEY",  # Set the API key for the corresponding platform in the environment variable or line 4
-            model_name="gpt-5-nano-2025-08-07",
+            model_name="qwen3-vl-8b-instruct",
             image_io=None,
             send_request_stream=False,
             max_workers=10,
@@ -44,13 +44,23 @@ def __init__(self, llm_serving: LLMServingABC = None):
     # ------------------------------------------------------------------ #
     def forward(self):
         input_image_key = "image"
-        output_answer_key = "vqa"
+        output_step1_key = "question"
+        output_step2_key = "answer"
 
+        # Step 1: Generate the question for the image
         self.vqa_generator.run(
             storage=self.storage.step(),
             input_conversation_key="conversation",
             input_image_key=input_image_key,
-            output_answer_key=output_answer_key,
+            output_answer_key=output_step1_key,
+        )
+
+        # Step 2: Generate the answer for the question
+        self.vqa_generator.run(
+            storage=self.storage.step(),
+            input_prompt_key=output_step1_key,
+            input_image_key=input_image_key,
+            output_answer_key=output_step2_key,
         )
 
 # ---------------------------- CLI 入口 -------------------------------- #
diff --git a/dataflow/statics/pipelines/gpu_pipelines/image2qa.py b/dataflow/statics/pipelines/gpu_pipelines/image2qa.py
index b5d2726..d2e4532 100644
--- a/dataflow/statics/pipelines/gpu_pipelines/image2qa.py
+++ b/dataflow/statics/pipelines/gpu_pipelines/image2qa.py
@@ -36,16 +36,26 @@ def __init__(self, llm_serving: LLMServingABC = None):
             system_prompt="You are a image question-answer generator. Your task is to generate a question-answer pair for the given image content.",
         )
 
-        self.media_key = "image"
-        self.output_key = "qa"
-
     # ------------------------- Pipeline 单步 ------------------------- #
     def forward(self):
+        input_image_key = "image"
+        output_step1_key = "question"
+        output_step2_key = "answer"
+
+        # Step 1: Generate the question for the image
         self.qa_generator.run(
             storage=self.storage.step(),
             input_conversation_key="conversation",
-            input_image_key=self.media_key,
-            output_answer_key=self.output_key,
+            input_image_key=input_image_key,
+            output_answer_key=output_step1_key,
+        )
+
+        # Step 2: Generate the answer for the question
+        self.qa_generator.run(
+            storage=self.storage.step(),
+            input_prompt_key=output_step1_key,
+            input_image_key=input_image_key,
+            output_answer_key=output_step2_key,
         )
 
 # ------------------------------ CLI ------------------------------ #