
Error when running kr inference #4

@luohuanhuan2019

Description

root@b0d92f076190:/workspace/RAG-DDR-main/scripts# sh kr_inference.sh
/opt/conda/lib/python3.10/site-packages/torch/cuda/__init__.py:54: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
import pynvml # type: ignore[import]
All model parameters:
dataset_file_path: /workspace/RAG-DDR-main/data/wow_dev_psg.jsonl
model_name_or_path: /workspace/RAG-DDR/Kr_model_for_Llama3_8b
output_path: /workspace/output/kr_results
file_name: kr_results_new.jsonl
top_n: 100
need_n: 5
cut_num: 1
number: 0
batch_size: 1
task: None
INFO 12-10 01:25:33 llm_engine.py:98] Initializing an LLM engine (v0.4.1) with config: model='/workspace/RAG-DDR/Kr_model_for_Llama3_8b', speculative_config=None, tokenizer='/workspace/RAG-DDR/Kr_model_for_Llama3_8b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO 12-10 01:25:34 utils.py:608] Found nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 12-10 01:25:34 selector.py:77] Cannot use FlashAttention backend because the flash_attn package is not found. Please install it for better performance.
INFO 12-10 01:25:34 selector.py:33] Using XFormers backend.
Traceback (most recent call last):
  File "/workspace/RAG-DDR-main/src/knowledgeRefinement/kr_inference.py", line 232, in <module>
    main()
  File "/workspace/RAG-DDR-main/src/knowledgeRefinement/kr_inference.py", line 181, in main
    reranker = LLMReranker(
  File "/workspace/RAG-DDR-main/src/knowledgeRefinement/kr_inference.py", line 23, in __init__
    model = LLM(
  File "/opt/conda/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 118, in __init__
    self.llm_engine = LLMEngine.from_engine_args(
  File "/opt/conda/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 277, in from_engine_args
    engine = cls(
  File "/opt/conda/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 148, in __init__
    self.model_executor = executor_class(
  File "/opt/conda/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 41, in __init__
    self._init_executor()
  File "/opt/conda/lib/python3.10/site-packages/vllm/executor/gpu_executor.py", line 22, in _init_executor
    self._init_non_spec_worker()
  File "/opt/conda/lib/python3.10/site-packages/vllm/executor/gpu_executor.py", line 51, in _init_non_spec_worker
    self.driver_worker.load_model()
  File "/opt/conda/lib/python3.10/site-packages/vllm/worker/worker.py", line 117, in load_model
    self.model_runner.load_model()
  File "/opt/conda/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 162, in load_model
    self.model = get_model(
  File "/opt/conda/lib/python3.10/site-packages/vllm/model_executor/model_loader/__init__.py", line 19, in get_model
    return loader.load_model(model_config=model_config,
  File "/opt/conda/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py", line 224, in load_model
    model.load_weights(
  File "/opt/conda/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 403, in load_weights
    param = params_dict[name]
KeyError: 'base_model.model.model.layers.0.self_attn.qkv_proj.lora_A.weight'
(A second run of sh kr_inference.sh fails with the same KeyError; the traceback is identical apart from timestamps and the kr_inference.py line numbers.)

Environment:
Package          Version
---------------  ------------
accelerate       0.30.1
deepspeed        0.14.2
torch            2.2.1
tqdm             4.67.1
uvloop           0.22.1
vllm             0.4.1
vllm-nccl-cu12   2.18.1.0.4.0
watchfiles       1.1.1
trl              0.8.6
transformers     4.40.0
faiss-cpu        1.10.0
faiss-gpu-cu12   1.11.0
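
The KeyError points at the likely cause: the directory given as model_name_or_path holds a PEFT LoRA adapter rather than a full model. Parameter names of the form base_model.model....lora_A.weight are what peft writes for an unmerged adapter, and vLLM 0.4.1's Llama loader (llama.py, load_weights) only knows the parameter names of a plain merged checkpoint, so the params_dict lookup fails. A quick way to confirm this is to list the tensor names in the checkpoint; a minimal sketch, assuming the adapter was saved as adapter_model.safetensors (the filename is an assumption; older peft versions write adapter_model.bin instead):

from safetensors import safe_open

ckpt = "/workspace/RAG-DDR/Kr_model_for_Llama3_8b/adapter_model.safetensors"  # assumed filename
with safe_open(ckpt, framework="pt", device="cpu") as f:
    for name in list(f.keys())[:5]:
        print(name)  # names ending in lora_A.weight / lora_B.weight confirm an unmerged adapter

If that is the case, one common fix is to merge the adapter into its base model with peft and point the script at the merged checkpoint. A minimal sketch; the base model name and the output directory are assumptions, not taken from the issue:

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

BASE_MODEL = "meta-llama/Meta-Llama-3-8B"  # assumption: the base the adapter was trained from
ADAPTER_DIR = "/workspace/RAG-DDR/Kr_model_for_Llama3_8b"
MERGED_DIR = "/workspace/RAG-DDR/Kr_model_for_Llama3_8b_merged"  # hypothetical output path

# Load the base model in bf16, matching dtype=torch.bfloat16 from the log above.
base = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16)

# Attach the adapter, then fold its low-rank updates into the base weights.
model = PeftModel.from_pretrained(base, ADAPTER_DIR)
model = model.merge_and_unload()

# Save a plain checkpoint whose parameter names vLLM's load_weights recognizes.
model.save_pretrained(MERGED_DIR)
AutoTokenizer.from_pretrained(ADAPTER_DIR).save_pretrained(MERGED_DIR)

Setting model_name_or_path to the merged directory should then load cleanly. Alternatively, vLLM 0.4.1 can serve an adapter directly via LLM(..., enable_lora=True) together with a LoRARequest passed to generate, but that would require changing kr_inference.py.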
