I have built my own demo file. After uploading one video, it gives blank output. Could anyone help me out?
-------------------------------Here's the demo file-------------------------
from argparse import ArgumentParser
from tasks.eval.model_utils import load_pllava
from tasks.eval.eval_utils import ChatPllava, conv_plain_v1, conv_templates
# System prompt text; run_inference passes it to chat.ask() together with the question.
SYSTEM = """You are a powerful Video Magic ChatBot, a large vision-language assistant.
You are able to understand the video content that the user provides and assist the user in a video-language related task.
The user might provide you with the video and maybe some extra noisy information to help you out or ask you a question. Make use of the information in a proper way to be competent for the job.
INSTRUCTIONS:
- Follow the user's instruction.
- Be critical yet believe in yourself.
"""
# Default conversation template. The script entry point rebinds this name to
# conv_templates[args.conv_mode]; run_inference copies it for each upload.
INIT_CONVERSATION = conv_plain_v1.copy()
def init_model(args):
    """Load PLLaVA and wrap the model and processor in a ``ChatPllava`` object.

    Args:
        args: Parsed command-line namespace. Reads
            ``pretrained_model_name_or_path``, ``num_frames``, ``use_lora``,
            ``weight_dir``, ``lora_alpha`` and ``use_multi_gpus``.

    Returns:
        The ``ChatPllava`` instance built from the loaded model and processor.
    """
    print('Initializing PLLaVA')
    if args.weight_dir is None:
        # NOTE(review): with weight_dir unset the loader log reports the whole
        # language model as "newly initialized", and the answer comes back
        # blank. Presumably load_pllava only restores the trained weights from
        # weight_dir -- confirm against load_pllava.
        print(
            "Warning: --weight_dir is not set; if the loader reports weights as "
            "'newly initialized', the model has no trained language-model "
            "weights and answers will be empty or meaningless."
        )
    model, processor = load_pllava(
        args.pretrained_model_name_or_path,
        args.num_frames,
        use_lora=args.use_lora,
        weight_dir=args.weight_dir,
        lora_alpha=args.lora_alpha,
        use_multi_gpus=args.use_multi_gpus,
    )
    if not args.use_multi_gpus:
        # Single-device run: move the model to the GPU here. In the multi-GPU
        # case placement is presumably handled inside load_pllava.
        model = model.to('cuda')
    return ChatPllava(model, processor)
def run_inference(video_path, question, chat, num_beams=1, temperature=1.0):
    """Ask one question about one video, print the exchange and return the answer.

    Args:
        video_path: Path of the video handed to ``chat.upload_video``.
        question: User question passed to ``chat.ask``.
        chat: Object exposing ``upload_video``, ``ask`` and ``answer``
            (what ``init_model`` returns).
        num_beams: Beam count forwarded to ``chat.answer``.
        temperature: Temperature forwarded to ``chat.answer``.

    Returns:
        The answer text with every ``<s>`` marker removed.
    """
    # Upload the video. A copy of the template is passed, presumably so the
    # module-level INIT_CONVERSATION is not modified by the chat object.
    _, img_list, chat_state = chat.upload_video(video_path, INIT_CONVERSATION.copy(), [])

    # Ask the question
    chat_state = chat.ask(question, chat_state, SYSTEM)

    # Get the answer from the model; the token output is not needed here.
    answer, _, chat_state = chat.answer(
        conv=chat_state,
        img_list=img_list,
        max_new_tokens=200,
        num_beams=num_beams,
        temperature=temperature,
    )

    # Clean up the output and print the response
    answer = answer.replace("<s>", "")
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    return answer
def parse_args(argv=None):
    """Parse the demo's command-line options.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the original
            zero-argument call did.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.
    """
    parser = ArgumentParser()
    parser.add_argument("--pretrained_model_name_or_path", type=str, default='/home/PLLaVA/MODELS/pllava-13b')
    parser.add_argument("--num_frames", type=int, default=4)
    parser.add_argument("--use_lora", action='store_true')
    parser.add_argument("--use_multi_gpus", action='store_true')
    parser.add_argument("--lora_alpha", type=int, default=None, help="LoRA alpha parameter.")
    # NOTE(review): leaving this unset coincides with the loader reporting the
    # language model as "newly initialized" -- confirm how load_pllava uses it.
    parser.add_argument(
        "--weight_dir",
        type=str,
        default=None,
        help="Directory forwarded to load_pllava as weight_dir.",
    )
    parser.add_argument("--conv_mode", type=str, default='eval_vcgbench')
    parser.add_argument("--video_path", type=str, default="/home/PLLaVA/EDATA/VCGBench/Test_Videos/v_-D1gdv_gQyw.mp4")
    parser.add_argument("--question", type=str, default="describe the video")
    parser.add_argument("--num_beams", type=int, default=1, help="Beam search numbers.")
    parser.add_argument("--temperature", type=float, default=1.0, help="Temperature for text generation.")
    return parser.parse_args(argv)
# Script entry point. The dunder names are required: a bare `name` is undefined
# and raises NameError, and "main" never equals the module name.
if __name__ == "__main__":
    args = parse_args()
    # Initialize the model
    chat = init_model(args)
    # Rebind the module-level template so run_inference uses the requested
    # conversation mode.
    INIT_CONVERSATION = conv_templates[args.conv_mode]
    # Run inference
    run_inference(args.video_path, args.question, chat, num_beams=args.num_beams, temperature=args.temperature)
-------------------------------Here's the output-------------------------------
Initializing PLLaVA
Loading model from /home/PLLaVA/MODELS/pllava-13b
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████| 6/6 [00:20<00:00, 3.38s/it]
Some weights of PllavaForConditionalGeneration were not initialized from the model checkpoint at /home/PLLaVA/MODELS/pllava-13b and are newly initialized: ['language_model.lm_head.weight', 'language_model.model.embed_tokens.weight', 'language_model.model.layers.0.input_layernorm.weight', 'language_model.model.layers.0.mlp.down_proj.weight', 'language_model.model.layers.0.mlp.gate_proj.weight', 'language_model.model.layers.0.mlp.up_proj.weight', 'language_model.model.layers.0.post_attention_layernorm.weight', 'language_model.model.layers.0.self_attn.k_proj.weight', 'language_model.model.layers.0.self_attn.o_proj.weight', 'language_model.model.layers.0.self_attn.q_proj.weight', 'language_model.model.layers.0.self_attn.v_proj.weight', 'language_model.model.layers.1.input_layernorm.weight', 'language_model.model.layers.1.mlp.down_proj.weight', 'language_model.model.layers.1.mlp.gate_proj.weight', 'language_model.model.layers.1.mlp.up_proj.weight', 'language_model.model.layers.1.post_attention_layernorm.weight', 'language_model.model.layers.1.self_attn.k_proj.weight', 'language_model.model.layers.1.self_attn.o_proj.weight', 'language_model.model.layers.1.self_attn.q_proj.weight', 'language_model.model.layers.1.self_attn.v_proj.weight', 'language_model.model.layers.10.input_layernorm.weight', 'language_model.model.layers.10.mlp.down_proj.weight', 'language_model.model.layers.10.mlp.gate_proj.weight', 'language_model.model.layers.10.mlp.up_proj.weight', 'language_model.model.layers.10.post_attention_layernorm.weight', 'language_model.model.layers.10.self_attn.k_proj.weight', 'language_model.model.layers.10.self_attn.o_proj.weight', 'language_model.model.layers.10.self_attn.q_proj.weight', 'language_model.model.layers.10.self_attn.v_proj.weight', 'language_model.model.layers.11.input_layernorm.weight', 'language_model.model.layers.11.mlp.down_proj.weight', 'language_model.model.layers.11.mlp.gate_proj.weight', 'language_model.model.layers.11.mlp.up_proj.weight', 
'language_model.model.layers.11.post_attention_layernorm.weight', 'language_model.model.layers.11.self_attn.k_proj.weight', 'language_model.model.layers.11.self_attn.o_proj.weight', 'language_model.model.layers.11.self_attn.q_proj.weight', 'language_model.model.layers.11.self_attn.v_proj.weight', 'language_model.model.layers.12.input_layernorm.weight', 'language_model.model.layers.12.mlp.down_proj.weight', 'language_model.model.layers.12.mlp.gate_proj.weight', 'language_model.model.layers.12.mlp.up_proj.weight', 'language_model.model.layers.12.post_attention_layernorm.weight', 'language_model.model.layers.12.self_attn.k_proj.weight', 'language_model.model.layers.12.self_attn.o_proj.weight', 'language_model.model.layers.12.self_attn.q_proj.weight', 'language_model.model.layers.12.self_attn.v_proj.weight', 'language_model.model.layers.13.input_layernorm.weight', 'language_model.model.layers.13.mlp.down_proj.weight', 'language_model.model.layers.13.mlp.gate_proj.weight', 'language_model.model.layers.13.mlp.up_proj.weight', 'language_model.model.layers.13.post_attention_layernorm.weight', 'language_model.model.layers.13.self_attn.k_proj.weight', 'language_model.model.layers.13.self_attn.o_proj.weight', 'language_model.model.layers.13.self_attn.q_proj.weight', 'language_model.model.layers.13.self_attn.v_proj.weight', 'language_model.model.layers.14.input_layernorm.weight', 'language_model.model.layers.14.mlp.down_proj.weight', 'language_model.model.layers.14.mlp.gate_proj.weight', 'language_model.model.layers.14.mlp.up_proj.weight', 'language_model.model.layers.14.post_attention_layernorm.weight', 'language_model.model.layers.14.self_attn.k_proj.weight', 'language_model.model.layers.14.self_attn.o_proj.weight', 'language_model.model.layers.14.self_attn.q_proj.weight', 'language_model.model.layers.14.self_attn.v_proj.weight', 'language_model.model.layers.15.input_layernorm.weight', 'language_model.model.layers.15.mlp.down_proj.weight', 
'language_model.model.layers.15.mlp.gate_proj.weight', 'language_model.model.layers.15.mlp.up_proj.weight', 'language_model.model.layers.15.post_attention_layernorm.weight', 'language_model.model.layers.15.self_attn.k_proj.weight', 'language_model.model.layers.15.self_attn.o_proj.weight', 'language_model.model.layers.15.self_attn.q_proj.weight', 'language_model.model.layers.15.self_attn.v_proj.weight', 'language_model.model.layers.16.input_layernorm.weight', 'language_model.model.layers.16.mlp.down_proj.weight', 'language_model.model.layers.16.mlp.gate_proj.weight', 'language_model.model.layers.16.mlp.up_proj.weight', 'language_model.model.layers.16.post_attention_layernorm.weight', 'language_model.model.layers.16.self_attn.k_proj.weight', 'language_model.model.layers.16.self_attn.o_proj.weight', 'language_model.model.layers.16.self_attn.q_proj.weight', 'language_model.model.layers.16.self_attn.v_proj.weight', 'language_model.model.layers.17.input_layernorm.weight', 'language_model.model.layers.17.mlp.down_proj.weight', 'language_model.model.layers.17.mlp.gate_proj.weight', 'language_model.model.layers.17.mlp.up_proj.weight', 'language_model.model.layers.17.post_attention_layernorm.weight', 'language_model.model.layers.17.self_attn.k_proj.weight', 'language_model.model.layers.17.self_attn.o_proj.weight', 'language_model.model.layers.17.self_attn.q_proj.weight', 'language_model.model.layers.17.self_attn.v_proj.weight', 'language_model.model.layers.18.input_layernorm.weight', 'language_model.model.layers.18.mlp.down_proj.weight', 'language_model.model.layers.18.mlp.gate_proj.weight', 'language_model.model.layers.18.mlp.up_proj.weight', 'language_model.model.layers.18.post_attention_layernorm.weight', 'language_model.model.layers.18.self_attn.k_proj.weight', 'language_model.model.layers.18.self_attn.o_proj.weight', 'language_model.model.layers.18.self_attn.q_proj.weight', 'language_model.model.layers.18.self_attn.v_proj.weight', 
'language_model.model.layers.19.input_layernorm.weight', 'language_model.model.layers.19.mlp.down_proj.weight', 'language_model.model.layers.19.mlp.gate_proj.weight', 'language_model.model.layers.19.mlp.up_proj.weight', 'language_model.model.layers.19.post_attention_layernorm.weight', 'language_model.model.layers.19.self_attn.k_proj.weight', 'language_model.model.layers.19.self_attn.o_proj.weight', 'language_model.model.layers.19.self_attn.q_proj.weight', 'language_model.model.layers.19.self_attn.v_proj.weight', 'language_model.model.layers.2.input_layernorm.weight', 'language_model.model.layers.2.mlp.down_proj.weight', 'language_model.model.layers.2.mlp.gate_proj.weight', 'language_model.model.layers.2.mlp.up_proj.weight', 'language_model.model.layers.2.post_attention_layernorm.weight', 'language_model.model.layers.2.self_attn.k_proj.weight', 'language_model.model.layers.2.self_attn.o_proj.weight', 'language_model.model.layers.2.self_attn.q_proj.weight', 'language_model.model.layers.2.self_attn.v_proj.weight', 'language_model.model.layers.20.input_layernorm.weight', 'language_model.model.layers.20.mlp.down_proj.weight', 'language_model.model.layers.20.mlp.gate_proj.weight', 'language_model.model.layers.20.mlp.up_proj.weight', 'language_model.model.layers.20.post_attention_layernorm.weight', 'language_model.model.layers.20.self_attn.k_proj.weight', 'language_model.model.layers.20.self_attn.o_proj.weight', 'language_model.model.layers.20.self_attn.q_proj.weight', 'language_model.model.layers.20.self_attn.v_proj.weight', 'language_model.model.layers.21.input_layernorm.weight', 'language_model.model.layers.21.mlp.down_proj.weight', 'language_model.model.layers.21.mlp.gate_proj.weight', 'language_model.model.layers.21.mlp.up_proj.weight', 'language_model.model.layers.21.post_attention_layernorm.weight', 'language_model.model.layers.21.self_attn.k_proj.weight', 'language_model.model.layers.21.self_attn.o_proj.weight', 
'language_model.model.layers.21.self_attn.q_proj.weight', 'language_model.model.layers.21.self_attn.v_proj.weight', 'language_model.model.layers.22.input_layernorm.weight', 'language_model.model.layers.22.mlp.down_proj.weight', 'language_model.model.layers.22.mlp.gate_proj.weight', 'language_model.model.layers.22.mlp.up_proj.weight', 'language_model.model.layers.22.post_attention_layernorm.weight', 'language_model.model.layers.22.self_attn.k_proj.weight', 'language_model.model.layers.22.self_attn.o_proj.weight', 'language_model.model.layers.22.self_attn.q_proj.weight', 'language_model.model.layers.22.self_attn.v_proj.weight', 'language_model.model.layers.23.input_layernorm.weight', 'language_model.model.layers.23.mlp.down_proj.weight', 'language_model.model.layers.23.mlp.gate_proj.weight', 'language_model.model.layers.23.mlp.up_proj.weight', 'language_model.model.layers.23.post_attention_layernorm.weight', 'language_model.model.layers.23.self_attn.k_proj.weight', 'language_model.model.layers.23.self_attn.o_proj.weight', 'language_model.model.layers.23.self_attn.q_proj.weight', 'language_model.model.layers.23.self_attn.v_proj.weight', 'language_model.model.layers.24.input_layernorm.weight', 'language_model.model.layers.24.mlp.down_proj.weight', 'language_model.model.layers.24.mlp.gate_proj.weight', 'language_model.model.layers.24.mlp.up_proj.weight', 'language_model.model.layers.24.post_attention_layernorm.weight', 'language_model.model.layers.24.self_attn.k_proj.weight', 'language_model.model.layers.24.self_attn.o_proj.weight', 'language_model.model.layers.24.self_attn.q_proj.weight', 'language_model.model.layers.24.self_attn.v_proj.weight', 'language_model.model.layers.25.input_layernorm.weight', 'language_model.model.layers.25.mlp.down_proj.weight', 'language_model.model.layers.25.mlp.gate_proj.weight', 'language_model.model.layers.25.mlp.up_proj.weight', 'language_model.model.layers.25.post_attention_layernorm.weight', 
'language_model.model.layers.25.self_attn.k_proj.weight', 'language_model.model.layers.25.self_attn.o_proj.weight', 'language_model.model.layers.25.self_attn.q_proj.weight', 'language_model.model.layers.25.self_attn.v_proj.weight', 'language_model.model.layers.26.input_layernorm.weight', 'language_model.model.layers.26.mlp.down_proj.weight', 'language_model.model.layers.26.mlp.gate_proj.weight', 'language_model.model.layers.26.mlp.up_proj.weight', 'language_model.model.layers.26.post_attention_layernorm.weight', 'language_model.model.layers.26.self_attn.k_proj.weight', 'language_model.model.layers.26.self_attn.o_proj.weight', 'language_model.model.layers.26.self_attn.q_proj.weight', 'language_model.model.layers.26.self_attn.v_proj.weight', 'language_model.model.layers.27.input_layernorm.weight', 'language_model.model.layers.27.mlp.down_proj.weight', 'language_model.model.layers.27.mlp.gate_proj.weight', 'language_model.model.layers.27.mlp.up_proj.weight', 'language_model.model.layers.27.post_attention_layernorm.weight', 'language_model.model.layers.27.self_attn.k_proj.weight', 'language_model.model.layers.27.self_attn.o_proj.weight', 'language_model.model.layers.27.self_attn.q_proj.weight', 'language_model.model.layers.27.self_attn.v_proj.weight', 'language_model.model.layers.28.input_layernorm.weight', 'language_model.model.layers.28.mlp.down_proj.weight', 'language_model.model.layers.28.mlp.gate_proj.weight', 'language_model.model.layers.28.mlp.up_proj.weight', 'language_model.model.layers.28.post_attention_layernorm.weight', 'language_model.model.layers.28.self_attn.k_proj.weight', 'language_model.model.layers.28.self_attn.o_proj.weight', 'language_model.model.layers.28.self_attn.q_proj.weight', 'language_model.model.layers.28.self_attn.v_proj.weight', 'language_model.model.layers.29.input_layernorm.weight', 'language_model.model.layers.29.mlp.down_proj.weight', 'language_model.model.layers.29.mlp.gate_proj.weight', 
'language_model.model.layers.29.mlp.up_proj.weight', 'language_model.model.layers.29.post_attention_layernorm.weight', 'language_model.model.layers.29.self_attn.k_proj.weight', 'language_model.model.layers.29.self_attn.o_proj.weight', 'language_model.model.layers.29.self_attn.q_proj.weight', 'language_model.model.layers.29.self_attn.v_proj.weight', 'language_model.model.layers.3.input_layernorm.weight', 'language_model.model.layers.3.mlp.down_proj.weight', 'language_model.model.layers.3.mlp.gate_proj.weight', 'language_model.model.layers.3.mlp.up_proj.weight', 'language_model.model.layers.3.post_attention_layernorm.weight', 'language_model.model.layers.3.self_attn.k_proj.weight', 'language_model.model.layers.3.self_attn.o_proj.weight', 'language_model.model.layers.3.self_attn.q_proj.weight', 'language_model.model.layers.3.self_attn.v_proj.weight', 'language_model.model.layers.30.input_layernorm.weight', 'language_model.model.layers.30.mlp.down_proj.weight', 'language_model.model.layers.30.mlp.gate_proj.weight', 'language_model.model.layers.30.mlp.up_proj.weight', 'language_model.model.layers.30.post_attention_layernorm.weight', 'language_model.model.layers.30.self_attn.k_proj.weight', 'language_model.model.layers.30.self_attn.o_proj.weight', 'language_model.model.layers.30.self_attn.q_proj.weight', 'language_model.model.layers.30.self_attn.v_proj.weight', 'language_model.model.layers.31.input_layernorm.weight', 'language_model.model.layers.31.mlp.down_proj.weight', 'language_model.model.layers.31.mlp.gate_proj.weight', 'language_model.model.layers.31.mlp.up_proj.weight', 'language_model.model.layers.31.post_attention_layernorm.weight', 'language_model.model.layers.31.self_attn.k_proj.weight', 'language_model.model.layers.31.self_attn.o_proj.weight', 'language_model.model.layers.31.self_attn.q_proj.weight', 'language_model.model.layers.31.self_attn.v_proj.weight', 'language_model.model.layers.32.input_layernorm.weight', 
'language_model.model.layers.32.mlp.down_proj.weight', 'language_model.model.layers.32.mlp.gate_proj.weight', 'language_model.model.layers.32.mlp.up_proj.weight', 'language_model.model.layers.32.post_attention_layernorm.weight', 'language_model.model.layers.32.self_attn.k_proj.weight', 'language_model.model.layers.32.self_attn.o_proj.weight', 'language_model.model.layers.32.self_attn.q_proj.weight', 'language_model.model.layers.32.self_attn.v_proj.weight', 'language_model.model.layers.33.input_layernorm.weight', 'language_model.model.layers.33.mlp.down_proj.weight', 'language_model.model.layers.33.mlp.gate_proj.weight', 'language_model.model.layers.33.mlp.up_proj.weight', 'language_model.model.layers.33.post_attention_layernorm.weight', 'language_model.model.layers.33.self_attn.k_proj.weight', 'language_model.model.layers.33.self_attn.o_proj.weight', 'language_model.model.layers.33.self_attn.q_proj.weight', 'language_model.model.layers.33.self_attn.v_proj.weight', 'language_model.model.layers.34.input_layernorm.weight', 'language_model.model.layers.34.mlp.down_proj.weight', 'language_model.model.layers.34.mlp.gate_proj.weight', 'language_model.model.layers.34.mlp.up_proj.weight', 'language_model.model.layers.34.post_attention_layernorm.weight', 'language_model.model.layers.34.self_attn.k_proj.weight', 'language_model.model.layers.34.self_attn.o_proj.weight', 'language_model.model.layers.34.self_attn.q_proj.weight', 'language_model.model.layers.34.self_attn.v_proj.weight', 'language_model.model.layers.35.input_layernorm.weight', 'language_model.model.layers.35.mlp.down_proj.weight', 'language_model.model.layers.35.mlp.gate_proj.weight', 'language_model.model.layers.35.mlp.up_proj.weight', 'language_model.model.layers.35.post_attention_layernorm.weight', 'language_model.model.layers.35.self_attn.k_proj.weight', 'language_model.model.layers.35.self_attn.o_proj.weight', 'language_model.model.layers.35.self_attn.q_proj.weight', 
'language_model.model.layers.35.self_attn.v_proj.weight', 'language_model.model.layers.36.input_layernorm.weight', 'language_model.model.layers.36.mlp.down_proj.weight', 'language_model.model.layers.36.mlp.gate_proj.weight', 'language_model.model.layers.36.mlp.up_proj.weight', 'language_model.model.layers.36.post_attention_layernorm.weight', 'language_model.model.layers.36.self_attn.k_proj.weight', 'language_model.model.layers.36.self_attn.o_proj.weight', 'language_model.model.layers.36.self_attn.q_proj.weight', 'language_model.model.layers.36.self_attn.v_proj.weight', 'language_model.model.layers.37.input_layernorm.weight', 'language_model.model.layers.37.mlp.down_proj.weight', 'language_model.model.layers.37.mlp.gate_proj.weight', 'language_model.model.layers.37.mlp.up_proj.weight', 'language_model.model.layers.37.post_attention_layernorm.weight', 'language_model.model.layers.37.self_attn.k_proj.weight', 'language_model.model.layers.37.self_attn.o_proj.weight', 'language_model.model.layers.37.self_attn.q_proj.weight', 'language_model.model.layers.37.self_attn.v_proj.weight', 'language_model.model.layers.38.input_layernorm.weight', 'language_model.model.layers.38.mlp.down_proj.weight', 'language_model.model.layers.38.mlp.gate_proj.weight', 'language_model.model.layers.38.mlp.up_proj.weight', 'language_model.model.layers.38.post_attention_layernorm.weight', 'language_model.model.layers.38.self_attn.k_proj.weight', 'language_model.model.layers.38.self_attn.o_proj.weight', 'language_model.model.layers.38.self_attn.q_proj.weight', 'language_model.model.layers.38.self_attn.v_proj.weight', 'language_model.model.layers.39.input_layernorm.weight', 'language_model.model.layers.39.mlp.down_proj.weight', 'language_model.model.layers.39.mlp.gate_proj.weight', 'language_model.model.layers.39.mlp.up_proj.weight', 'language_model.model.layers.39.post_attention_layernorm.weight', 'language_model.model.layers.39.self_attn.k_proj.weight', 
'language_model.model.layers.39.self_attn.o_proj.weight', 'language_model.model.layers.39.self_attn.q_proj.weight', 'language_model.model.layers.39.self_attn.v_proj.weight', 'language_model.model.layers.4.input_layernorm.weight', 'language_model.model.layers.4.mlp.down_proj.weight', 'language_model.model.layers.4.mlp.gate_proj.weight', 'language_model.model.layers.4.mlp.up_proj.weight', 'language_model.model.layers.4.post_attention_layernorm.weight', 'language_model.model.layers.4.self_attn.k_proj.weight', 'language_model.model.layers.4.self_attn.o_proj.weight', 'language_model.model.layers.4.self_attn.q_proj.weight', 'language_model.model.layers.4.self_attn.v_proj.weight', 'language_model.model.layers.5.input_layernorm.weight', 'language_model.model.layers.5.mlp.down_proj.weight', 'language_model.model.layers.5.mlp.gate_proj.weight', 'language_model.model.layers.5.mlp.up_proj.weight', 'language_model.model.layers.5.post_attention_layernorm.weight', 'language_model.model.layers.5.self_attn.k_proj.weight', 'language_model.model.layers.5.self_attn.o_proj.weight', 'language_model.model.layers.5.self_attn.q_proj.weight', 'language_model.model.layers.5.self_attn.v_proj.weight', 'language_model.model.layers.6.input_layernorm.weight', 'language_model.model.layers.6.mlp.down_proj.weight', 'language_model.model.layers.6.mlp.gate_proj.weight', 'language_model.model.layers.6.mlp.up_proj.weight', 'language_model.model.layers.6.post_attention_layernorm.weight', 'language_model.model.layers.6.self_attn.k_proj.weight', 'language_model.model.layers.6.self_attn.o_proj.weight', 'language_model.model.layers.6.self_attn.q_proj.weight', 'language_model.model.layers.6.self_attn.v_proj.weight', 'language_model.model.layers.7.input_layernorm.weight', 'language_model.model.layers.7.mlp.down_proj.weight', 'language_model.model.layers.7.mlp.gate_proj.weight', 'language_model.model.layers.7.mlp.up_proj.weight', 'language_model.model.layers.7.post_attention_layernorm.weight', 
'language_model.model.layers.7.self_attn.k_proj.weight', 'language_model.model.layers.7.self_attn.o_proj.weight', 'language_model.model.layers.7.self_attn.q_proj.weight', 'language_model.model.layers.7.self_attn.v_proj.weight', 'language_model.model.layers.8.input_layernorm.weight', 'language_model.model.layers.8.mlp.down_proj.weight', 'language_model.model.layers.8.mlp.gate_proj.weight', 'language_model.model.layers.8.mlp.up_proj.weight', 'language_model.model.layers.8.post_attention_layernorm.weight', 'language_model.model.layers.8.self_attn.k_proj.weight', 'language_model.model.layers.8.self_attn.o_proj.weight', 'language_model.model.layers.8.self_attn.q_proj.weight', 'language_model.model.layers.8.self_attn.v_proj.weight', 'language_model.model.layers.9.input_layernorm.weight', 'language_model.model.layers.9.mlp.down_proj.weight', 'language_model.model.layers.9.mlp.gate_proj.weight', 'language_model.model.layers.9.mlp.up_proj.weight', 'language_model.model.layers.9.post_attention_layernorm.weight', 'language_model.model.layers.9.self_attn.k_proj.weight', 'language_model.model.layers.9.self_attn.o_proj.weight', 'language_model.model.layers.9.self_attn.q_proj.weight', 'language_model.model.layers.9.self_attn.v_proj.weight', 'language_model.model.norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Model loaded
Loading processor from /home/PLLaVA/MODELS/pllava-13b
Processor loaded
Frame size: (1280, 720), Mode: RGB
Frame size: (1280, 720), Mode: RGB
Frame size: (1280, 720), Mode: RGB
Frame size: (1280, 720), Mode: RGB
Input video shape: 4 1280 720
img_list shape: [(1280, 720), (1280, 720), (1280, 720), (1280, 720)]
Uploaded 1 images for processing.
Generated Prompt: You are a powerful Video Magic ChatBot, a large vision-language assistant.
You are able to understand the video content that the user provides and assist the user in a video-language related task.
The user might provide you with the video and maybe some extra noisy information to help you out or ask you a question. Make use of the information in a proper way to be competent for the job.
INSTRUCTIONS:
-
Follow the user's instruction.
-
Be critical yet believe in yourself.
USER: ![]()
USER: USER: describe the video ASSISTANT:
Processed inputs keys: dict_keys(['pixel_values', 'input_ids', 'attention_mask'])
Pixel values shape: torch.Size([4, 3, 336, 336])
Pixel values: tensor([[[[-1.1061, -1.1061, -1.1353, ..., -0.2010, -0.2302, -0.2448],
[-1.1353, -1.1207, -1.1353, ..., -0.2594, -0.2740, -0.2594],
[-1.1353, -1.1353, -1.1353, ..., -0.0842, -0.2448, -0.2886],
...,
[-0.3908, -0.2302, -0.1280, ..., 0.0325, 0.0179, 0.0179],
[-0.3470, -0.1864, -0.0988, ..., 0.0325, 0.0179, 0.0179],
[-0.3178, -0.1426, -0.0550, ..., 0.0325, 0.0179, 0.0179]],
[[-0.6715, -0.6715, -0.7016, ..., 0.3190, 0.3040, 0.2890],
[-0.7016, -0.6865, -0.7016, ..., 0.2740, 0.2589, 0.2740],
[-0.7016, -0.7016, -0.7016, ..., 0.4540, 0.3040, 0.2589],
...,
[-0.2213, -0.0562, 0.0488, ..., -0.2063, -0.2063, -0.2063],
[-0.1763, -0.0112, 0.0789, ..., -0.2063, -0.2063, -0.2063],
[-0.1463, 0.0338, 0.1239, ..., -0.2063, -0.2063, -0.2063]],
[[-0.3000, -0.3000, -0.3284, ..., 0.4395, 0.4110, 0.3684],
[-0.3284, -0.3142, -0.3284, ..., 0.3542, 0.2973, 0.2973],
[-0.3284, -0.3284, -0.3284, ..., 0.4964, 0.2831, 0.2262],
...,
[-0.2004, -0.0156, 0.0982, ..., -0.1435, -0.1720, -0.1720],
[-0.1720, 0.0271, 0.1266, ..., -0.1435, -0.1720, -0.1720],
[-0.1293, 0.0698, 0.1693, ..., -0.1435, -0.1720, -0.1720]]],
[[[ 0.6603, 0.7041, 0.8647, ..., 0.6311, 0.6603, 0.7771],
[ 0.4851, 0.5873, 0.7041, ..., 0.5727, 0.5289, 0.6895],
[ 0.4997, 0.8501, 0.5289, ..., 0.5873, 0.3099, 0.5143],
...,
[ 0.8501, 1.2734, 1.1712, ..., -0.1864, -0.7266, -1.3397],
[ 0.6749, 1.2734, 1.2150, ..., 0.6457, 0.0325, -0.5660],
[ 0.5581, 0.9230, 0.8792, ..., 1.2734, 1.2588, 1.1420]],
[[ 0.7692, 0.8142, 0.9793, ..., 0.6041, 0.6341, 0.7542],
[ 0.5891, 0.6942, 0.8142, ..., 0.5441, 0.4991, 0.6642],
[ 0.6041, 0.9643, 0.6341, ..., 0.5591, 0.2740, 0.4991],
...,
[ 0.9193, 1.3545, 1.2495, ..., -0.1012, -0.6415, -1.2568],
[ 0.7392, 1.3545, 1.2945, ..., 0.7542, 0.1239, -0.4764],
[ 0.6191, 0.9943, 0.9493, ..., 1.4145, 1.3845, 1.2945]],
[[ 0.6812, 0.7239, 0.8803, ..., 0.4964, 0.5532, 0.7239],
[ 0.5106, 0.6101, 0.7239, ..., 0.4395, 0.4253, 0.6244],
[ 0.5248, 0.8661, 0.5532, ..., 0.4537, 0.2120, 0.4679],
...,
[ 0.8661, 1.2785, 1.1789, ..., -0.1435, -0.6128, -1.1958],
[ 0.6955, 1.2785, 1.2216, ..., 0.6812, 0.1266, -0.4422],
[ 0.5817, 0.9372, 0.8945, ..., 1.3211, 1.3211, 1.2358]]],
[[[ 0.0179, -0.2448, -0.4346, ..., 0.0763, 0.2953, 0.3391],
[ 0.2077, -0.1280, -0.3032, ..., -0.1718, -0.0405, 0.3537],
[ 0.2369, 0.2369, 0.1347, ..., 0.5143, 0.0617, 0.2515],
...,
[-0.9164, -0.5368, 0.4267, ..., -0.2886, -0.1572, 0.2077],
[-0.5076, -0.2010, 0.4705, ..., -0.2448, 0.2077, 0.0909],
[-0.1864, 0.0471, 0.4413, ..., 0.1055, 0.2807, -0.3324]],
[[ 0.3040, 0.0338, -0.1613, ..., 0.0638, 0.2890, 0.3190],
[ 0.4991, 0.1539, -0.0262, ..., -0.1913, -0.0562, 0.3490],
[ 0.5291, 0.5291, 0.4240, ..., 0.5141, 0.0488, 0.2289],
...,
[-0.9117, -0.5215, 0.4691, ..., -0.1012, 0.0488, 0.4390],
[-0.4914, -0.1763, 0.5141, ..., -0.0862, 0.4090, 0.3040],
[-0.1613, 0.0789, 0.4841, ..., 0.2740, 0.4841, -0.1313]],
[[ 0.2688, 0.0129, -0.1720, ..., 0.0129, 0.2120, 0.2831],
[ 0.4537, 0.1266, -0.0440, ..., -0.2146, -0.1151, 0.2973],
[ 0.4821, 0.4821, 0.3826, ..., 0.4537, -0.0156, 0.1977],
...,
[-0.7977, -0.4279, 0.5106, ..., -0.0724, 0.0840, 0.4537],
[-0.3995, -0.1009, 0.5532, ..., -0.0582, 0.4253, 0.2973],
[-0.0867, 0.1409, 0.5248, ..., 0.2831, 0.4537, -0.1435]]],
[[[ 0.5873, 0.0763, 0.0033, ..., 0.1785, 0.1785, 0.5289],
[ 0.4705, 0.1931, 0.3829, ..., 0.0179, -0.3324, 0.3975],
[ 0.3099, 0.2807, 0.4705, ..., 0.7041, 0.3099, 0.8209],
...,
[ 0.8063, 0.6895, 0.6457, ..., 0.5581, 0.7333, 0.8501],
[ 0.6165, 0.5727, 0.7917, ..., 0.6311, 1.0252, 1.2880],
[ 0.7771, 0.7625, 0.8792, ..., 0.4267, 0.9668, 1.2588]],
[[ 0.6191, 0.0939, 0.0188, ..., 0.1689, 0.1689, 0.5291],
[ 0.4991, 0.2139, 0.4090, ..., 0.0038, -0.3564, 0.3940],
[ 0.3340, 0.3040, 0.4991, ..., 0.7092, 0.3040, 0.8292],
...,
[ 1.1144, 0.9943, 1.0093, ..., 0.5291, 0.7242, 0.8743],
[ 0.9043, 0.8743, 1.1594, ..., 0.6041, 1.0393, 1.3395],
[ 1.0694, 1.0694, 1.2344, ..., 0.3940, 0.9793, 1.3095]],
[[ 0.5959, 0.0982, 0.0271, ..., 0.0555, 0.0555, 0.3968],
[ 0.4821, 0.2120, 0.3968, ..., -0.1009, -0.4422, 0.2688],
[ 0.3257, 0.2973, 0.4821, ..., 0.5675, 0.1835, 0.6812],
...,
[ 1.3496, 1.2358, 1.2216, ..., 0.4110, 0.6101, 0.7666],
[ 1.1505, 1.1221, 1.3638, ..., 0.4821, 0.8803, 1.1932],
[ 1.3069, 1.3069, 1.4491, ..., 0.2831, 0.8234, 1.1647]]]])
Input IDs shape: torch.Size([1, 140])
Input IDs: tensor([[ 1, 887, 526, 263, 13988, 13987, 26494, 678, 271, 29933,
327, 29892, 263, 2919, 18551, 29899, 11675, 20255, 29889, 29871,
13, 3492, 526, 2221, 304, 2274, 278, 4863, 2793, 393,
278, 1404, 8128, 322, 6985, 278, 1404, 297, 263, 4863,
29899, 11675, 4475, 3414, 29889, 13, 1576, 1404, 1795, 3867,
366, 411, 278, 4863, 322, 5505, 777, 4805, 694, 13344,
2472, 304, 1371, 366, 714, 470, 2244, 366, 263, 1139,
29889, 8561, 671, 310, 278, 2472, 297, 263, 1571, 982,
304, 367, 5100, 296, 363, 278, 4982, 29889, 13, 2277,
29937, 2672, 10810, 29965, 9838, 29903, 29901, 13, 29896, 29889,
10306, 278, 1404, 29915, 29879, 15278, 29889, 13, 29906, 29889,
1522, 12187, 3447, 4658, 297, 7535, 29889, 13, 3148, 1001,
29901, 29871, 32000, 29871, 13, 3148, 1001, 29901, 29871, 3148,
1001, 29901, 8453, 278, 4863, 319, 1799, 9047, 13566, 29901]])
###PROMPT: You are a powerful Video Magic ChatBot, a large vision-language assistant.
You are able to understand the video content that the user provides and assist the user in a video-language related task.
The user might provide you with the video and maybe some extra noisy information to help you out or ask you a question. Make use of the information in a proper way to be competent for the job.
INSTRUCTIONS:
- Follow the user's instruction.
- Be critical yet believe in yourself.
USER: ![]()
USER: USER: describe the video ASSISTANT:
###LM OUTPUT TEXT You are a powerful Video Magic ChatBot, a large vision-language assistant.
You are able to understand the video content that the user provides and assist the user in a video-language related task.
The user might provide you with the video and maybe some extra noisy information to help you out or ask you a question. Make use of the information in a proper way to be competent for the job.
INSTRUCTIONS:
- Follow the user's instruction.
- Be critical yet believe in yourself.
USER:
USER: USER: describe the video ASSISTANT:
Question: describe the video
Answer:
I have built my own demo file. After uploading one video, it gives blank output. Could anyone help me out?
-------------------------------Here's the demo file-------------------------
from argparse import ArgumentParser
from tasks.eval.model_utils import load_pllava
from tasks.eval.eval_utils import ChatPllava, conv_plain_v1, conv_templates
# System prompt that accompanies every question sent to the chat wrapper.
# The two bullet points under INSTRUCTIONS are required: without them the
# prompt ends on a dangling "INSTRUCTIONS:" heading.
SYSTEM = """You are a powerful Video Magic ChatBot, a large vision-language assistant.
You are able to understand the video content that the user provides and assist the user in a video-language related task.
The user might provide you with the video and maybe some extra noisy information to help you out or ask you a question. Make use of the information in a proper way to be competent for the job.
INSTRUCTIONS:
- Follow the user's instruction.
- Be critical yet believe in yourself.
"""
# Base conversation template, copied once from conv_plain_v1 at import time.
# run_inference() takes a further .copy() of it for each uploaded video.
INIT_CONVERSATION = conv_plain_v1.copy()
def init_model(args):
    """Load the PLLaVA model and processor and wrap them in a ChatPllava.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``pretrained_model_name_or_path``, ``num_frames``, ``use_lora``,
            ``weight_dir``, ``lora_alpha`` and ``use_multi_gpus``.

    Returns:
        A ``ChatPllava`` built from the loaded model and processor.
    """
    print('Initializing PLLaVA')
    model, processor = load_pllava(
        args.pretrained_model_name_or_path,
        args.num_frames,
        use_lora=args.use_lora,
        weight_dir=args.weight_dir,
        lora_alpha=args.lora_alpha,
        use_multi_gpus=args.use_multi_gpus,
    )
    # Only the single-GPU path moves the model explicitly; with
    # use_multi_gpus the placement is left to load_pllava
    # (presumably it dispatches across devices itself - TODO confirm).
    if not args.use_multi_gpus:
        model = model.to('cuda')
    return ChatPllava(model, processor)
def run_inference(video_path, question, chat, num_beams=1, temperature=1.0):
    """Run one question about one video through the chat wrapper.

    Uploads the video, asks the question together with the SYSTEM prompt,
    generates an answer, prints the question/answer pair and returns the
    answer text.

    Args:
        video_path: Path of the video file handed to ``chat.upload_video``.
        question: The user's question about the video.
        chat: Chat wrapper exposing ``upload_video``, ``ask`` and ``answer``
            (the object returned by ``init_model``).
        num_beams: Beam count forwarded to ``chat.answer``.
        temperature: Sampling temperature forwarded to ``chat.answer``.

    Returns:
        The generated answer with any literal ``<s>`` markers removed.
    """
    # Upload the video. A fresh copy of the template keeps repeated calls
    # from sharing conversation state.
    _, img_list, chat_state = chat.upload_video(
        video_path, INIT_CONVERSATION.copy(), []
    )

    # Ask the question
    chat_state = chat.ask(question, chat_state, SYSTEM)

    # Get the answer from the model
    llm_message, _, chat_state = chat.answer(
        conv=chat_state,
        img_list=img_list,
        max_new_tokens=200,
        num_beams=num_beams,
        temperature=temperature,
    )

    # Print the response
    llm_message = llm_message.replace("<s>", "")  # Clean up the output
    print(f"Question: {question}")
    print(f"Answer: {llm_message}")
    return llm_message
def parse_args(argv=None):
    """Parse the demo's command-line options.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            the arguments are taken from ``sys.argv``, as before.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = ArgumentParser()
    parser.add_argument("--pretrained_model_name_or_path", type=str, required=False, default='/home/PLLaVA/MODELS/pllava-13b')
    parser.add_argument("--num_frames", type=int, required=False, default=4)
    parser.add_argument("--use_lora", action='store_true')
    parser.add_argument("--use_multi_gpus", action='store_true')
    parser.add_argument("--lora_alpha", type=int, default=None, help="LoRA alpha parameter.")
    # NOTE(review): with weight_dir left at None the run log reports every
    # language_model weight as "newly initialized", i.e. the model answers
    # with untrained weights. Presumably the trained weights have to be
    # supplied through --weight_dir (plus --use_lora/--lora_alpha if they
    # are LoRA weights) - confirm against load_pllava.
    parser.add_argument("--weight_dir", type=str, required=False, default=None)
    parser.add_argument("--conv_mode", type=str, required=False, default='eval_vcgbench')
    parser.add_argument("--video_path", type=str, required=False, default="/home/PLLaVA/EDATA/VCGBench/Test_Videos/v_-D1gdv_gQyw.mp4")
    parser.add_argument("--question", type=str, required=False, default="describe the video")
    parser.add_argument("--num_beams", type=int, default=1, help="Beam search numbers.")
    parser.add_argument("--temperature", type=float, default=1.0, help="Temperature for text generation.")
    return parser.parse_args(argv)
# Script entry point: parse the options, build the chat wrapper, then run a
# single question against the configured video. The guard must compare
# __name__ with "__main__" (with the underscores), otherwise running the file
# raises NameError.
if __name__ == "__main__":
    args = parse_args()
    chat = init_model(args)
    run_inference(
        args.video_path,
        args.question,
        chat,
        num_beams=args.num_beams,
        temperature=args.temperature,
    )
-------------------------------Here's the output-------------------------------
Initializing PLLaVA
Loading model from /home/PLLaVA/MODELS/pllava-13b
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████| 6/6 [00:20<00:00, 3.38s/it]
Some weights of PllavaForConditionalGeneration were not initialized from the model checkpoint at /home/PLLaVA/MODELS/pllava-13b and are newly initialized: ['language_model.lm_head.weight', 'language_model.model.embed_tokens.weight', 'language_model.model.layers.0.input_layernorm.weight', 'language_model.model.layers.0.mlp.down_proj.weight', 'language_model.model.layers.0.mlp.gate_proj.weight', 'language_model.model.layers.0.mlp.up_proj.weight', 'language_model.model.layers.0.post_attention_layernorm.weight', 'language_model.model.layers.0.self_attn.k_proj.weight', 'language_model.model.layers.0.self_attn.o_proj.weight', 'language_model.model.layers.0.self_attn.q_proj.weight', 'language_model.model.layers.0.self_attn.v_proj.weight', 'language_model.model.layers.1.input_layernorm.weight', 'language_model.model.layers.1.mlp.down_proj.weight', 'language_model.model.layers.1.mlp.gate_proj.weight', 'language_model.model.layers.1.mlp.up_proj.weight', 'language_model.model.layers.1.post_attention_layernorm.weight', 'language_model.model.layers.1.self_attn.k_proj.weight', 'language_model.model.layers.1.self_attn.o_proj.weight', 'language_model.model.layers.1.self_attn.q_proj.weight', 'language_model.model.layers.1.self_attn.v_proj.weight', 'language_model.model.layers.10.input_layernorm.weight', 'language_model.model.layers.10.mlp.down_proj.weight', 'language_model.model.layers.10.mlp.gate_proj.weight', 'language_model.model.layers.10.mlp.up_proj.weight', 'language_model.model.layers.10.post_attention_layernorm.weight', 'language_model.model.layers.10.self_attn.k_proj.weight', 'language_model.model.layers.10.self_attn.o_proj.weight', 'language_model.model.layers.10.self_attn.q_proj.weight', 'language_model.model.layers.10.self_attn.v_proj.weight', 'language_model.model.layers.11.input_layernorm.weight', 'language_model.model.layers.11.mlp.down_proj.weight', 'language_model.model.layers.11.mlp.gate_proj.weight', 'language_model.model.layers.11.mlp.up_proj.weight', 
'language_model.model.layers.11.post_attention_layernorm.weight', 'language_model.model.layers.11.self_attn.k_proj.weight', 'language_model.model.layers.11.self_attn.o_proj.weight', 'language_model.model.layers.11.self_attn.q_proj.weight', 'language_model.model.layers.11.self_attn.v_proj.weight', 'language_model.model.layers.12.input_layernorm.weight', 'language_model.model.layers.12.mlp.down_proj.weight', 'language_model.model.layers.12.mlp.gate_proj.weight', 'language_model.model.layers.12.mlp.up_proj.weight', 'language_model.model.layers.12.post_attention_layernorm.weight', 'language_model.model.layers.12.self_attn.k_proj.weight', 'language_model.model.layers.12.self_attn.o_proj.weight', 'language_model.model.layers.12.self_attn.q_proj.weight', 'language_model.model.layers.12.self_attn.v_proj.weight', 'language_model.model.layers.13.input_layernorm.weight', 'language_model.model.layers.13.mlp.down_proj.weight', 'language_model.model.layers.13.mlp.gate_proj.weight', 'language_model.model.layers.13.mlp.up_proj.weight', 'language_model.model.layers.13.post_attention_layernorm.weight', 'language_model.model.layers.13.self_attn.k_proj.weight', 'language_model.model.layers.13.self_attn.o_proj.weight', 'language_model.model.layers.13.self_attn.q_proj.weight', 'language_model.model.layers.13.self_attn.v_proj.weight', 'language_model.model.layers.14.input_layernorm.weight', 'language_model.model.layers.14.mlp.down_proj.weight', 'language_model.model.layers.14.mlp.gate_proj.weight', 'language_model.model.layers.14.mlp.up_proj.weight', 'language_model.model.layers.14.post_attention_layernorm.weight', 'language_model.model.layers.14.self_attn.k_proj.weight', 'language_model.model.layers.14.self_attn.o_proj.weight', 'language_model.model.layers.14.self_attn.q_proj.weight', 'language_model.model.layers.14.self_attn.v_proj.weight', 'language_model.model.layers.15.input_layernorm.weight', 'language_model.model.layers.15.mlp.down_proj.weight', 
'language_model.model.layers.15.mlp.gate_proj.weight', 'language_model.model.layers.15.mlp.up_proj.weight', 'language_model.model.layers.15.post_attention_layernorm.weight', 'language_model.model.layers.15.self_attn.k_proj.weight', 'language_model.model.layers.15.self_attn.o_proj.weight', 'language_model.model.layers.15.self_attn.q_proj.weight', 'language_model.model.layers.15.self_attn.v_proj.weight', 'language_model.model.layers.16.input_layernorm.weight', 'language_model.model.layers.16.mlp.down_proj.weight', 'language_model.model.layers.16.mlp.gate_proj.weight', 'language_model.model.layers.16.mlp.up_proj.weight', 'language_model.model.layers.16.post_attention_layernorm.weight', 'language_model.model.layers.16.self_attn.k_proj.weight', 'language_model.model.layers.16.self_attn.o_proj.weight', 'language_model.model.layers.16.self_attn.q_proj.weight', 'language_model.model.layers.16.self_attn.v_proj.weight', 'language_model.model.layers.17.input_layernorm.weight', 'language_model.model.layers.17.mlp.down_proj.weight', 'language_model.model.layers.17.mlp.gate_proj.weight', 'language_model.model.layers.17.mlp.up_proj.weight', 'language_model.model.layers.17.post_attention_layernorm.weight', 'language_model.model.layers.17.self_attn.k_proj.weight', 'language_model.model.layers.17.self_attn.o_proj.weight', 'language_model.model.layers.17.self_attn.q_proj.weight', 'language_model.model.layers.17.self_attn.v_proj.weight', 'language_model.model.layers.18.input_layernorm.weight', 'language_model.model.layers.18.mlp.down_proj.weight', 'language_model.model.layers.18.mlp.gate_proj.weight', 'language_model.model.layers.18.mlp.up_proj.weight', 'language_model.model.layers.18.post_attention_layernorm.weight', 'language_model.model.layers.18.self_attn.k_proj.weight', 'language_model.model.layers.18.self_attn.o_proj.weight', 'language_model.model.layers.18.self_attn.q_proj.weight', 'language_model.model.layers.18.self_attn.v_proj.weight', 
'language_model.model.layers.19.input_layernorm.weight', 'language_model.model.layers.19.mlp.down_proj.weight', 'language_model.model.layers.19.mlp.gate_proj.weight', 'language_model.model.layers.19.mlp.up_proj.weight', 'language_model.model.layers.19.post_attention_layernorm.weight', 'language_model.model.layers.19.self_attn.k_proj.weight', 'language_model.model.layers.19.self_attn.o_proj.weight', 'language_model.model.layers.19.self_attn.q_proj.weight', 'language_model.model.layers.19.self_attn.v_proj.weight', 'language_model.model.layers.2.input_layernorm.weight', 'language_model.model.layers.2.mlp.down_proj.weight', 'language_model.model.layers.2.mlp.gate_proj.weight', 'language_model.model.layers.2.mlp.up_proj.weight', 'language_model.model.layers.2.post_attention_layernorm.weight', 'language_model.model.layers.2.self_attn.k_proj.weight', 'language_model.model.layers.2.self_attn.o_proj.weight', 'language_model.model.layers.2.self_attn.q_proj.weight', 'language_model.model.layers.2.self_attn.v_proj.weight', 'language_model.model.layers.20.input_layernorm.weight', 'language_model.model.layers.20.mlp.down_proj.weight', 'language_model.model.layers.20.mlp.gate_proj.weight', 'language_model.model.layers.20.mlp.up_proj.weight', 'language_model.model.layers.20.post_attention_layernorm.weight', 'language_model.model.layers.20.self_attn.k_proj.weight', 'language_model.model.layers.20.self_attn.o_proj.weight', 'language_model.model.layers.20.self_attn.q_proj.weight', 'language_model.model.layers.20.self_attn.v_proj.weight', 'language_model.model.layers.21.input_layernorm.weight', 'language_model.model.layers.21.mlp.down_proj.weight', 'language_model.model.layers.21.mlp.gate_proj.weight', 'language_model.model.layers.21.mlp.up_proj.weight', 'language_model.model.layers.21.post_attention_layernorm.weight', 'language_model.model.layers.21.self_attn.k_proj.weight', 'language_model.model.layers.21.self_attn.o_proj.weight', 
'language_model.model.layers.21.self_attn.q_proj.weight', 'language_model.model.layers.21.self_attn.v_proj.weight', 'language_model.model.layers.22.input_layernorm.weight', 'language_model.model.layers.22.mlp.down_proj.weight', 'language_model.model.layers.22.mlp.gate_proj.weight', 'language_model.model.layers.22.mlp.up_proj.weight', 'language_model.model.layers.22.post_attention_layernorm.weight', 'language_model.model.layers.22.self_attn.k_proj.weight', 'language_model.model.layers.22.self_attn.o_proj.weight', 'language_model.model.layers.22.self_attn.q_proj.weight', 'language_model.model.layers.22.self_attn.v_proj.weight', 'language_model.model.layers.23.input_layernorm.weight', 'language_model.model.layers.23.mlp.down_proj.weight', 'language_model.model.layers.23.mlp.gate_proj.weight', 'language_model.model.layers.23.mlp.up_proj.weight', 'language_model.model.layers.23.post_attention_layernorm.weight', 'language_model.model.layers.23.self_attn.k_proj.weight', 'language_model.model.layers.23.self_attn.o_proj.weight', 'language_model.model.layers.23.self_attn.q_proj.weight', 'language_model.model.layers.23.self_attn.v_proj.weight', 'language_model.model.layers.24.input_layernorm.weight', 'language_model.model.layers.24.mlp.down_proj.weight', 'language_model.model.layers.24.mlp.gate_proj.weight', 'language_model.model.layers.24.mlp.up_proj.weight', 'language_model.model.layers.24.post_attention_layernorm.weight', 'language_model.model.layers.24.self_attn.k_proj.weight', 'language_model.model.layers.24.self_attn.o_proj.weight', 'language_model.model.layers.24.self_attn.q_proj.weight', 'language_model.model.layers.24.self_attn.v_proj.weight', 'language_model.model.layers.25.input_layernorm.weight', 'language_model.model.layers.25.mlp.down_proj.weight', 'language_model.model.layers.25.mlp.gate_proj.weight', 'language_model.model.layers.25.mlp.up_proj.weight', 'language_model.model.layers.25.post_attention_layernorm.weight', 
'language_model.model.layers.25.self_attn.k_proj.weight', 'language_model.model.layers.25.self_attn.o_proj.weight', 'language_model.model.layers.25.self_attn.q_proj.weight', 'language_model.model.layers.25.self_attn.v_proj.weight', 'language_model.model.layers.26.input_layernorm.weight', 'language_model.model.layers.26.mlp.down_proj.weight', 'language_model.model.layers.26.mlp.gate_proj.weight', 'language_model.model.layers.26.mlp.up_proj.weight', 'language_model.model.layers.26.post_attention_layernorm.weight', 'language_model.model.layers.26.self_attn.k_proj.weight', 'language_model.model.layers.26.self_attn.o_proj.weight', 'language_model.model.layers.26.self_attn.q_proj.weight', 'language_model.model.layers.26.self_attn.v_proj.weight', 'language_model.model.layers.27.input_layernorm.weight', 'language_model.model.layers.27.mlp.down_proj.weight', 'language_model.model.layers.27.mlp.gate_proj.weight', 'language_model.model.layers.27.mlp.up_proj.weight', 'language_model.model.layers.27.post_attention_layernorm.weight', 'language_model.model.layers.27.self_attn.k_proj.weight', 'language_model.model.layers.27.self_attn.o_proj.weight', 'language_model.model.layers.27.self_attn.q_proj.weight', 'language_model.model.layers.27.self_attn.v_proj.weight', 'language_model.model.layers.28.input_layernorm.weight', 'language_model.model.layers.28.mlp.down_proj.weight', 'language_model.model.layers.28.mlp.gate_proj.weight', 'language_model.model.layers.28.mlp.up_proj.weight', 'language_model.model.layers.28.post_attention_layernorm.weight', 'language_model.model.layers.28.self_attn.k_proj.weight', 'language_model.model.layers.28.self_attn.o_proj.weight', 'language_model.model.layers.28.self_attn.q_proj.weight', 'language_model.model.layers.28.self_attn.v_proj.weight', 'language_model.model.layers.29.input_layernorm.weight', 'language_model.model.layers.29.mlp.down_proj.weight', 'language_model.model.layers.29.mlp.gate_proj.weight', 
'language_model.model.layers.29.mlp.up_proj.weight', 'language_model.model.layers.29.post_attention_layernorm.weight', 'language_model.model.layers.29.self_attn.k_proj.weight', 'language_model.model.layers.29.self_attn.o_proj.weight', 'language_model.model.layers.29.self_attn.q_proj.weight', 'language_model.model.layers.29.self_attn.v_proj.weight', 'language_model.model.layers.3.input_layernorm.weight', 'language_model.model.layers.3.mlp.down_proj.weight', 'language_model.model.layers.3.mlp.gate_proj.weight', 'language_model.model.layers.3.mlp.up_proj.weight', 'language_model.model.layers.3.post_attention_layernorm.weight', 'language_model.model.layers.3.self_attn.k_proj.weight', 'language_model.model.layers.3.self_attn.o_proj.weight', 'language_model.model.layers.3.self_attn.q_proj.weight', 'language_model.model.layers.3.self_attn.v_proj.weight', 'language_model.model.layers.30.input_layernorm.weight', 'language_model.model.layers.30.mlp.down_proj.weight', 'language_model.model.layers.30.mlp.gate_proj.weight', 'language_model.model.layers.30.mlp.up_proj.weight', 'language_model.model.layers.30.post_attention_layernorm.weight', 'language_model.model.layers.30.self_attn.k_proj.weight', 'language_model.model.layers.30.self_attn.o_proj.weight', 'language_model.model.layers.30.self_attn.q_proj.weight', 'language_model.model.layers.30.self_attn.v_proj.weight', 'language_model.model.layers.31.input_layernorm.weight', 'language_model.model.layers.31.mlp.down_proj.weight', 'language_model.model.layers.31.mlp.gate_proj.weight', 'language_model.model.layers.31.mlp.up_proj.weight', 'language_model.model.layers.31.post_attention_layernorm.weight', 'language_model.model.layers.31.self_attn.k_proj.weight', 'language_model.model.layers.31.self_attn.o_proj.weight', 'language_model.model.layers.31.self_attn.q_proj.weight', 'language_model.model.layers.31.self_attn.v_proj.weight', 'language_model.model.layers.32.input_layernorm.weight', 
'language_model.model.layers.32.mlp.down_proj.weight', 'language_model.model.layers.32.mlp.gate_proj.weight', 'language_model.model.layers.32.mlp.up_proj.weight', 'language_model.model.layers.32.post_attention_layernorm.weight', 'language_model.model.layers.32.self_attn.k_proj.weight', 'language_model.model.layers.32.self_attn.o_proj.weight', 'language_model.model.layers.32.self_attn.q_proj.weight', 'language_model.model.layers.32.self_attn.v_proj.weight', 'language_model.model.layers.33.input_layernorm.weight', 'language_model.model.layers.33.mlp.down_proj.weight', 'language_model.model.layers.33.mlp.gate_proj.weight', 'language_model.model.layers.33.mlp.up_proj.weight', 'language_model.model.layers.33.post_attention_layernorm.weight', 'language_model.model.layers.33.self_attn.k_proj.weight', 'language_model.model.layers.33.self_attn.o_proj.weight', 'language_model.model.layers.33.self_attn.q_proj.weight', 'language_model.model.layers.33.self_attn.v_proj.weight', 'language_model.model.layers.34.input_layernorm.weight', 'language_model.model.layers.34.mlp.down_proj.weight', 'language_model.model.layers.34.mlp.gate_proj.weight', 'language_model.model.layers.34.mlp.up_proj.weight', 'language_model.model.layers.34.post_attention_layernorm.weight', 'language_model.model.layers.34.self_attn.k_proj.weight', 'language_model.model.layers.34.self_attn.o_proj.weight', 'language_model.model.layers.34.self_attn.q_proj.weight', 'language_model.model.layers.34.self_attn.v_proj.weight', 'language_model.model.layers.35.input_layernorm.weight', 'language_model.model.layers.35.mlp.down_proj.weight', 'language_model.model.layers.35.mlp.gate_proj.weight', 'language_model.model.layers.35.mlp.up_proj.weight', 'language_model.model.layers.35.post_attention_layernorm.weight', 'language_model.model.layers.35.self_attn.k_proj.weight', 'language_model.model.layers.35.self_attn.o_proj.weight', 'language_model.model.layers.35.self_attn.q_proj.weight', 
'language_model.model.layers.35.self_attn.v_proj.weight', 'language_model.model.layers.36.input_layernorm.weight', 'language_model.model.layers.36.mlp.down_proj.weight', 'language_model.model.layers.36.mlp.gate_proj.weight', 'language_model.model.layers.36.mlp.up_proj.weight', 'language_model.model.layers.36.post_attention_layernorm.weight', 'language_model.model.layers.36.self_attn.k_proj.weight', 'language_model.model.layers.36.self_attn.o_proj.weight', 'language_model.model.layers.36.self_attn.q_proj.weight', 'language_model.model.layers.36.self_attn.v_proj.weight', 'language_model.model.layers.37.input_layernorm.weight', 'language_model.model.layers.37.mlp.down_proj.weight', 'language_model.model.layers.37.mlp.gate_proj.weight', 'language_model.model.layers.37.mlp.up_proj.weight', 'language_model.model.layers.37.post_attention_layernorm.weight', 'language_model.model.layers.37.self_attn.k_proj.weight', 'language_model.model.layers.37.self_attn.o_proj.weight', 'language_model.model.layers.37.self_attn.q_proj.weight', 'language_model.model.layers.37.self_attn.v_proj.weight', 'language_model.model.layers.38.input_layernorm.weight', 'language_model.model.layers.38.mlp.down_proj.weight', 'language_model.model.layers.38.mlp.gate_proj.weight', 'language_model.model.layers.38.mlp.up_proj.weight', 'language_model.model.layers.38.post_attention_layernorm.weight', 'language_model.model.layers.38.self_attn.k_proj.weight', 'language_model.model.layers.38.self_attn.o_proj.weight', 'language_model.model.layers.38.self_attn.q_proj.weight', 'language_model.model.layers.38.self_attn.v_proj.weight', 'language_model.model.layers.39.input_layernorm.weight', 'language_model.model.layers.39.mlp.down_proj.weight', 'language_model.model.layers.39.mlp.gate_proj.weight', 'language_model.model.layers.39.mlp.up_proj.weight', 'language_model.model.layers.39.post_attention_layernorm.weight', 'language_model.model.layers.39.self_attn.k_proj.weight', 
'language_model.model.layers.39.self_attn.o_proj.weight', 'language_model.model.layers.39.self_attn.q_proj.weight', 'language_model.model.layers.39.self_attn.v_proj.weight', 'language_model.model.layers.4.input_layernorm.weight', 'language_model.model.layers.4.mlp.down_proj.weight', 'language_model.model.layers.4.mlp.gate_proj.weight', 'language_model.model.layers.4.mlp.up_proj.weight', 'language_model.model.layers.4.post_attention_layernorm.weight', 'language_model.model.layers.4.self_attn.k_proj.weight', 'language_model.model.layers.4.self_attn.o_proj.weight', 'language_model.model.layers.4.self_attn.q_proj.weight', 'language_model.model.layers.4.self_attn.v_proj.weight', 'language_model.model.layers.5.input_layernorm.weight', 'language_model.model.layers.5.mlp.down_proj.weight', 'language_model.model.layers.5.mlp.gate_proj.weight', 'language_model.model.layers.5.mlp.up_proj.weight', 'language_model.model.layers.5.post_attention_layernorm.weight', 'language_model.model.layers.5.self_attn.k_proj.weight', 'language_model.model.layers.5.self_attn.o_proj.weight', 'language_model.model.layers.5.self_attn.q_proj.weight', 'language_model.model.layers.5.self_attn.v_proj.weight', 'language_model.model.layers.6.input_layernorm.weight', 'language_model.model.layers.6.mlp.down_proj.weight', 'language_model.model.layers.6.mlp.gate_proj.weight', 'language_model.model.layers.6.mlp.up_proj.weight', 'language_model.model.layers.6.post_attention_layernorm.weight', 'language_model.model.layers.6.self_attn.k_proj.weight', 'language_model.model.layers.6.self_attn.o_proj.weight', 'language_model.model.layers.6.self_attn.q_proj.weight', 'language_model.model.layers.6.self_attn.v_proj.weight', 'language_model.model.layers.7.input_layernorm.weight', 'language_model.model.layers.7.mlp.down_proj.weight', 'language_model.model.layers.7.mlp.gate_proj.weight', 'language_model.model.layers.7.mlp.up_proj.weight', 'language_model.model.layers.7.post_attention_layernorm.weight', 
'language_model.model.layers.7.self_attn.k_proj.weight', 'language_model.model.layers.7.self_attn.o_proj.weight', 'language_model.model.layers.7.self_attn.q_proj.weight', 'language_model.model.layers.7.self_attn.v_proj.weight', 'language_model.model.layers.8.input_layernorm.weight', 'language_model.model.layers.8.mlp.down_proj.weight', 'language_model.model.layers.8.mlp.gate_proj.weight', 'language_model.model.layers.8.mlp.up_proj.weight', 'language_model.model.layers.8.post_attention_layernorm.weight', 'language_model.model.layers.8.self_attn.k_proj.weight', 'language_model.model.layers.8.self_attn.o_proj.weight', 'language_model.model.layers.8.self_attn.q_proj.weight', 'language_model.model.layers.8.self_attn.v_proj.weight', 'language_model.model.layers.9.input_layernorm.weight', 'language_model.model.layers.9.mlp.down_proj.weight', 'language_model.model.layers.9.mlp.gate_proj.weight', 'language_model.model.layers.9.mlp.up_proj.weight', 'language_model.model.layers.9.post_attention_layernorm.weight', 'language_model.model.layers.9.self_attn.k_proj.weight', 'language_model.model.layers.9.self_attn.o_proj.weight', 'language_model.model.layers.9.self_attn.q_proj.weight', 'language_model.model.layers.9.self_attn.v_proj.weight', 'language_model.model.norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Model loaded
Loading processor from /home/PLLaVA/MODELS/pllava-13b
Processor loaded
Frame size: (1280, 720), Mode: RGB
Frame size: (1280, 720), Mode: RGB
Frame size: (1280, 720), Mode: RGB
Frame size: (1280, 720), Mode: RGB
Input video shape: 4 1280 720
img_list shape: [(1280, 720), (1280, 720), (1280, 720), (1280, 720)]
Uploaded 1 images for processing.
Generated Prompt: You are a powerful Video Magic ChatBot, a large vision-language assistant.
You are able to understand the video content that the user provides and assist the user in a video-language related task.
The user might provide you with the video and maybe some extra noisy information to help you out or ask you a question. Make use of the information in a proper way to be competent for the job.
INSTRUCTIONS:
Follow the user's instruction.
Be critical yet believe in yourself.![]()
USER:
USER: USER: describe the video ASSISTANT:
Processed inputs keys: dict_keys(['pixel_values', 'input_ids', 'attention_mask'])
Pixel values shape: torch.Size([4, 3, 336, 336])
Pixel values: tensor([[[[-1.1061, -1.1061, -1.1353, ..., -0.2010, -0.2302, -0.2448],
[-1.1353, -1.1207, -1.1353, ..., -0.2594, -0.2740, -0.2594],
[-1.1353, -1.1353, -1.1353, ..., -0.0842, -0.2448, -0.2886],
...,
[-0.3908, -0.2302, -0.1280, ..., 0.0325, 0.0179, 0.0179],
[-0.3470, -0.1864, -0.0988, ..., 0.0325, 0.0179, 0.0179],
[-0.3178, -0.1426, -0.0550, ..., 0.0325, 0.0179, 0.0179]],
Input IDs shape: torch.Size([1, 140])
Input IDs: tensor([[ 1, 887, 526, 263, 13988, 13987, 26494, 678, 271, 29933,
327, 29892, 263, 2919, 18551, 29899, 11675, 20255, 29889, 29871,
13, 3492, 526, 2221, 304, 2274, 278, 4863, 2793, 393,
278, 1404, 8128, 322, 6985, 278, 1404, 297, 263, 4863,
29899, 11675, 4475, 3414, 29889, 13, 1576, 1404, 1795, 3867,
366, 411, 278, 4863, 322, 5505, 777, 4805, 694, 13344,
2472, 304, 1371, 366, 714, 470, 2244, 366, 263, 1139,
29889, 8561, 671, 310, 278, 2472, 297, 263, 1571, 982,
304, 367, 5100, 296, 363, 278, 4982, 29889, 13, 2277,
29937, 2672, 10810, 29965, 9838, 29903, 29901, 13, 29896, 29889,
10306, 278, 1404, 29915, 29879, 15278, 29889, 13, 29906, 29889,
1522, 12187, 3447, 4658, 297, 7535, 29889, 13, 3148, 1001,
29901, 29871, 32000, 29871, 13, 3148, 1001, 29901, 29871, 3148,
1001, 29901, 8453, 278, 4863, 319, 1799, 9047, 13566, 29901]])
###PROMPT: You are a powerful Video Magic ChatBot, a large vision-language assistant.
You are able to understand the video content that the user provides and assist the user in a video-language related task.
The user might provide you with the video and maybe some extra noisy information to help you out or ask you a question. Make use of the information in a proper way to be competent for the job.
INSTRUCTIONS:
USER:
USER: USER: describe the video ASSISTANT:
###LM OUTPUT TEXT You are a powerful Video Magic ChatBot, a large vision-language assistant.
You are able to understand the video content that the user provides and assist the user in a video-language related task.
The user might provide you with the video and maybe some extra noisy information to help you out or ask you a question. Make use of the information in a proper way to be competent for the job.
INSTRUCTIONS:
USER:
USER: USER: describe the video ASSISTANT:
Question: describe the video
Answer: