Description
root@b0d92f076190:/workspace/RAG-DDR-main/scripts# sh kr_inference.sh
/opt/conda/lib/python3.10/site-packages/torch/cuda/__init__.py:54: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
import pynvml # type: ignore[import]
All model parameters:
dataset_file_path: /workspace/RAG-DDR-main/data/wow_dev_psg.jsonl
model_name_or_path: /workspace/RAG-DDR/Kr_model_for_Llama3_8b
output_path: /workspace/output/kr_results
file_name: kr_results_new.jsonl
top_n: 100
need_n: 5
cut_num: 1
number: 0
batch_size: 1
task: None
INFO 12-10 01:25:33 llm_engine.py:98] Initializing an LLM engine (v0.4.1) with config: model='/workspace/RAG-DDR/Kr_model_for_Llama3_8b', speculative_config=None, tokenizer='/workspace/RAG-DDR/Kr_model_for_Llama3_8b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO 12-10 01:25:34 utils.py:608] Found nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 12-10 01:25:34 selector.py:77] Cannot use FlashAttention backend because the flash_attn package is not found. Please install it for better performance.
INFO 12-10 01:25:34 selector.py:33] Using XFormers backend.
Traceback (most recent call last):
File "/workspace/RAG-DDR-main/src/knowledgeRefinement/kr_inference.py", line 232, in
main()
File "/workspace/RAG-DDR-main/src/knowledgeRefinement/kr_inference.py", line 181, in main
reranker = LLMReranker(
File "/workspace/RAG-DDR-main/src/knowledgeRefinement/kr_inference.py", line 23, in init
model = LLM(
File "/opt/conda/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 118, in init
self.llm_engine = LLMEngine.from_engine_args(
File "/opt/conda/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 277, in from_engine_args
engine = cls(
File "/opt/conda/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 148, in init
self.model_executor = executor_class(
File "/opt/conda/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 41, in init
self._init_executor()
File "/opt/conda/lib/python3.10/site-packages/vllm/executor/gpu_executor.py", line 22, in _init_executor
self._init_non_spec_worker()
File "/opt/conda/lib/python3.10/site-packages/vllm/executor/gpu_executor.py", line 51, in _init_non_spec_worker
self.driver_worker.load_model()
File "/opt/conda/lib/python3.10/site-packages/vllm/worker/worker.py", line 117, in load_model
self.model_runner.load_model()
File "/opt/conda/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 162, in load_model
self.model = get_model(
File "/opt/conda/lib/python3.10/site-packages/vllm/model_executor/model_loader/init.py", line 19, in get_model
return loader.load_model(model_config=model_config,
File "/opt/conda/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py", line 224, in load_model
model.load_weights(
File "/opt/conda/lib/python3.10/site-packages/vllm/model_executor/models/llama.py", line 403, in load_weights
param = params_dict[name]
KeyError: 'base_model.model.model.layers.0.self_attn.qkv_proj.lora_A.weight'
Re-running kr_inference.sh reproduces the identical KeyError.
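
The failing key, base_model.model.model.layers.0.self_attn.qkv_proj.lora_A.weight, follows PEFT's LoRA naming scheme, which suggests that /workspace/RAG-DDR/Kr_model_for_Llama3_8b is an unmerged LoRA adapter rather than a full checkpoint; vLLM 0.4.1's Llama weight loader only knows the base-model parameter names, so the first adapter key raises a KeyError. A quick way to test that hypothesis is to inspect the checkpoint directory — a minimal sketch, assuming the standard PEFT file names (this is illustrative, not part of the repository):

```python
# Sketch: does the checkpoint directory look like a PEFT LoRA adapter
# or a full model? Illustrative only; assumes standard PEFT file names.
import os

ckpt = "/workspace/RAG-DDR/Kr_model_for_Llama3_8b"
files = set(os.listdir(ckpt))

adapter_markers = {"adapter_config.json", "adapter_model.safetensors", "adapter_model.bin"}
if files & adapter_markers:
    print("PEFT LoRA adapter -> merge it into the base model before loading with vLLM")
else:
    print("Full checkpoint; the KeyError has another cause. Files:", sorted(files))
```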
Environment:
Package Version
accelerate 0.30.1
deepspeed 0.14.2
torch 2.2.1
tqdm 4.67.1
uvloop 0.22.1
vllm 0.4.1
vllm-nccl-cu12 2.18.1.0.4.0
watchfiles 1.1.1
trl 0.8.6
transformers 4.40.0
faiss-cpu 1.10.0
faiss-gpu-cu12 1.11.0
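
If the directory is indeed a LoRA adapter, one possible fix is to merge it into its base model with PEFT and point model_name_or_path at the merged copy. A sketch, assuming the adapter was trained on Llama-3-8B (the base-model path below is a guess, not taken from the issue):

```python
# Sketch: fold the LoRA deltas into the base weights so vLLM sees
# ordinary Llama parameter names. The base-model path is an assumption.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_path = "meta-llama/Meta-Llama-3-8B"  # assumed; use the actual base model
adapter_path = "/workspace/RAG-DDR/Kr_model_for_Llama3_8b"
merged_path = "/workspace/RAG-DDR/Kr_model_for_Llama3_8b_merged"

base = AutoModelForCausalLM.from_pretrained(base_path, torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base, adapter_path)
model = model.merge_and_unload()  # materialize W + BA, drop the adapter wrappers
model.save_pretrained(merged_path)
AutoTokenizer.from_pretrained(adapter_path).save_pretrained(merged_path)
```

After merging, re-running kr_inference.sh with model_name_or_path set to the merged directory should let vLLM load the weights without hitting the adapter key names.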