Skip to content
This repository was archived by the owner on Feb 3, 2026. It is now read-only.
This repository was archived by the owner on Feb 3, 2026. It is now read-only.

why the output is blank #81

@BOYJZ

Description

@BOYJZ

I have built my own demo file. After uploading one video, it gives blank output. Could anyone help me out?
-------------------------------Here's the demo file-------------------------

from argparse import ArgumentParser
from tasks.eval.model_utils import load_pllava
from tasks.eval.eval_utils import ChatPllava, conv_plain_v1, conv_templates

# System prompt handed to chat.ask() together with the user's question.
# NOTE(review): this literal was copied from a rendered issue page; markdown
# rendering may have altered it (e.g. heading markers or list indentation) --
# compare against the original script before relying on the exact text.
SYSTEM = """You are a powerful Video Magic ChatBot, a large vision-language assistant.
You are able to understand the video content that the user provides and assist the user in a video-language related task.
The user might provide you with the video and maybe some extra noisy information to help you out or ask you a question. Make use of the information in a proper way to be competent for the job.

INSTRUCTIONS:

  1. Follow the user's instruction.
  2. Be critical yet believe in yourself.
    """

# Default conversation template. run_inference() copies it for each call, and
# the script entry point rebinds it to conv_templates[args.conv_mode].
INIT_CONVERSATION = conv_plain_v1.copy()

def init_model(args):
    """Load the PLLaVA model and processor and wrap them in a ChatPllava.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``pretrained_model_name_or_path``, ``num_frames``, ``use_lora``,
            ``weight_dir``, ``lora_alpha`` and ``use_multi_gpus``.

    Returns:
        The ``ChatPllava`` object built from the loaded model and processor.
    """
    print('Initializing PLLaVA')
    # NOTE(review): the console log in this issue reports every
    # language_model.* weight as "newly initialized"; presumably weight_dir
    # has to point at the trained weights -- confirm against load_pllava.
    model, processor = load_pllava(
        args.pretrained_model_name_or_path,
        args.num_frames,
        use_lora=args.use_lora,
        weight_dir=args.weight_dir,
        lora_alpha=args.lora_alpha,
        use_multi_gpus=args.use_multi_gpus,
    )
    if not args.use_multi_gpus:
        # Only the single-device path moves the model explicitly; with
        # use_multi_gpus the model is left wherever load_pllava placed it.
        model = model.to('cuda')
    chat = ChatPllava(model, processor)
    return chat

def run_inference(video_path, question, chat, num_beams=1, temperature=1.0):
    """Upload one video, ask one question, print and return the answer.

    Args:
        video_path: Path of the video passed to ``chat.upload_video``.
        question: User question passed to ``chat.ask``.
        chat: Chat wrapper exposing ``upload_video``, ``ask`` and ``answer``
            (the object returned by ``init_model``).
        num_beams: Forwarded to ``chat.answer``.
        temperature: Forwarded to ``chat.answer``.

    Returns:
        The answer text with every ``"<s>"`` marker removed. It may be an
        empty string, as in the run shown in this issue.
    """
    # Upload the video; start from a fresh copy of the conversation template
    # so repeated calls do not share message history.
    _, img_list, chat_state = chat.upload_video(
        video_path, INIT_CONVERSATION.copy(), []
    )

    # Ask the question
    chat_state = chat.ask(question, chat_state, SYSTEM)

    # Get the answer from the model (the returned token output is not used).
    llm_message, _, chat_state = chat.answer(
        conv=chat_state,
        img_list=img_list,
        max_new_tokens=200,
        num_beams=num_beams,
        temperature=temperature,
    )

    # Print the response
    llm_message = llm_message.replace("<s>", "")  # Clean up the output
    print(f"Question: {question}")
    print(f"Answer: {llm_message}")
    return llm_message

def parse_args(argv=None):
    """Build the command-line parser for the demo and parse arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is what a call with no
            arguments does.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.
    """
    parser = ArgumentParser()
    parser.add_argument("--pretrained_model_name_or_path", type=str, required=False, default='/home/PLLaVA/MODELS/pllava-13b')
    parser.add_argument("--num_frames", type=int, required=False, default=4)
    parser.add_argument("--use_lora", action='store_true')
    parser.add_argument("--use_multi_gpus", action='store_true')
    parser.add_argument("--lora_alpha", type=int, default=None, help="LoRA alpha parameter.")
    parser.add_argument("--weight_dir", type=str, required=False, default=None)
    parser.add_argument("--conv_mode", type=str, required=False, default='eval_vcgbench')
    parser.add_argument("--video_path", type=str, required=False, default="/home/PLLaVA/EDATA/VCGBench/Test_Videos/v_-D1gdv_gQyw.mp4")
    parser.add_argument("--question", type=str, required=False, default="describe the video")
    parser.add_argument("--num_beams", type=int, default=1, help="Beam search numbers.")
    parser.add_argument("--temperature", type=float, default=1.0, help="Temperature for text generation.")
    args = parser.parse_args(argv)
    return args

# Script entry point. The guard must compare __name__ with "__main__"; the
# rendered issue page dropped the double underscores, which left a reference
# to an undefined variable `name`.
if __name__ == "__main__":
    args = parse_args()

    # Initialize the model
    chat = init_model(args)
    # Rebind the module-level template that run_inference() copies, so the
    # conversation format follows --conv_mode.
    INIT_CONVERSATION = conv_templates[args.conv_mode]

    # Run inference
    run_inference(args.video_path, args.question, chat, num_beams=args.num_beams, temperature=args.temperature)

-------------------------------Here's the output-------------------------------

Initializing PLLaVA
Loading model from /home/PLLaVA/MODELS/pllava-13b
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████| 6/6 [00:20<00:00, 3.38s/it]
Some weights of PllavaForConditionalGeneration were not initialized from the model checkpoint at /home/PLLaVA/MODELS/pllava-13b and are newly initialized: ['language_model.lm_head.weight', 'language_model.model.embed_tokens.weight', 'language_model.model.layers.0.input_layernorm.weight', 'language_model.model.layers.0.mlp.down_proj.weight', 'language_model.model.layers.0.mlp.gate_proj.weight', 'language_model.model.layers.0.mlp.up_proj.weight', 'language_model.model.layers.0.post_attention_layernorm.weight', 'language_model.model.layers.0.self_attn.k_proj.weight', 'language_model.model.layers.0.self_attn.o_proj.weight', 'language_model.model.layers.0.self_attn.q_proj.weight', 'language_model.model.layers.0.self_attn.v_proj.weight', 'language_model.model.layers.1.input_layernorm.weight', 'language_model.model.layers.1.mlp.down_proj.weight', 'language_model.model.layers.1.mlp.gate_proj.weight', 'language_model.model.layers.1.mlp.up_proj.weight', 'language_model.model.layers.1.post_attention_layernorm.weight', 'language_model.model.layers.1.self_attn.k_proj.weight', 'language_model.model.layers.1.self_attn.o_proj.weight', 'language_model.model.layers.1.self_attn.q_proj.weight', 'language_model.model.layers.1.self_attn.v_proj.weight', 'language_model.model.layers.10.input_layernorm.weight', 'language_model.model.layers.10.mlp.down_proj.weight', 'language_model.model.layers.10.mlp.gate_proj.weight', 'language_model.model.layers.10.mlp.up_proj.weight', 'language_model.model.layers.10.post_attention_layernorm.weight', 'language_model.model.layers.10.self_attn.k_proj.weight', 'language_model.model.layers.10.self_attn.o_proj.weight', 'language_model.model.layers.10.self_attn.q_proj.weight', 'language_model.model.layers.10.self_attn.v_proj.weight', 'language_model.model.layers.11.input_layernorm.weight', 'language_model.model.layers.11.mlp.down_proj.weight', 'language_model.model.layers.11.mlp.gate_proj.weight', 'language_model.model.layers.11.mlp.up_proj.weight', 
'language_model.model.layers.11.post_attention_layernorm.weight', 'language_model.model.layers.11.self_attn.k_proj.weight', 'language_model.model.layers.11.self_attn.o_proj.weight', 'language_model.model.layers.11.self_attn.q_proj.weight', 'language_model.model.layers.11.self_attn.v_proj.weight', 'language_model.model.layers.12.input_layernorm.weight', 'language_model.model.layers.12.mlp.down_proj.weight', 'language_model.model.layers.12.mlp.gate_proj.weight', 'language_model.model.layers.12.mlp.up_proj.weight', 'language_model.model.layers.12.post_attention_layernorm.weight', 'language_model.model.layers.12.self_attn.k_proj.weight', 'language_model.model.layers.12.self_attn.o_proj.weight', 'language_model.model.layers.12.self_attn.q_proj.weight', 'language_model.model.layers.12.self_attn.v_proj.weight', 'language_model.model.layers.13.input_layernorm.weight', 'language_model.model.layers.13.mlp.down_proj.weight', 'language_model.model.layers.13.mlp.gate_proj.weight', 'language_model.model.layers.13.mlp.up_proj.weight', 'language_model.model.layers.13.post_attention_layernorm.weight', 'language_model.model.layers.13.self_attn.k_proj.weight', 'language_model.model.layers.13.self_attn.o_proj.weight', 'language_model.model.layers.13.self_attn.q_proj.weight', 'language_model.model.layers.13.self_attn.v_proj.weight', 'language_model.model.layers.14.input_layernorm.weight', 'language_model.model.layers.14.mlp.down_proj.weight', 'language_model.model.layers.14.mlp.gate_proj.weight', 'language_model.model.layers.14.mlp.up_proj.weight', 'language_model.model.layers.14.post_attention_layernorm.weight', 'language_model.model.layers.14.self_attn.k_proj.weight', 'language_model.model.layers.14.self_attn.o_proj.weight', 'language_model.model.layers.14.self_attn.q_proj.weight', 'language_model.model.layers.14.self_attn.v_proj.weight', 'language_model.model.layers.15.input_layernorm.weight', 'language_model.model.layers.15.mlp.down_proj.weight', 
'language_model.model.layers.15.mlp.gate_proj.weight', 'language_model.model.layers.15.mlp.up_proj.weight', 'language_model.model.layers.15.post_attention_layernorm.weight', 'language_model.model.layers.15.self_attn.k_proj.weight', 'language_model.model.layers.15.self_attn.o_proj.weight', 'language_model.model.layers.15.self_attn.q_proj.weight', 'language_model.model.layers.15.self_attn.v_proj.weight', 'language_model.model.layers.16.input_layernorm.weight', 'language_model.model.layers.16.mlp.down_proj.weight', 'language_model.model.layers.16.mlp.gate_proj.weight', 'language_model.model.layers.16.mlp.up_proj.weight', 'language_model.model.layers.16.post_attention_layernorm.weight', 'language_model.model.layers.16.self_attn.k_proj.weight', 'language_model.model.layers.16.self_attn.o_proj.weight', 'language_model.model.layers.16.self_attn.q_proj.weight', 'language_model.model.layers.16.self_attn.v_proj.weight', 'language_model.model.layers.17.input_layernorm.weight', 'language_model.model.layers.17.mlp.down_proj.weight', 'language_model.model.layers.17.mlp.gate_proj.weight', 'language_model.model.layers.17.mlp.up_proj.weight', 'language_model.model.layers.17.post_attention_layernorm.weight', 'language_model.model.layers.17.self_attn.k_proj.weight', 'language_model.model.layers.17.self_attn.o_proj.weight', 'language_model.model.layers.17.self_attn.q_proj.weight', 'language_model.model.layers.17.self_attn.v_proj.weight', 'language_model.model.layers.18.input_layernorm.weight', 'language_model.model.layers.18.mlp.down_proj.weight', 'language_model.model.layers.18.mlp.gate_proj.weight', 'language_model.model.layers.18.mlp.up_proj.weight', 'language_model.model.layers.18.post_attention_layernorm.weight', 'language_model.model.layers.18.self_attn.k_proj.weight', 'language_model.model.layers.18.self_attn.o_proj.weight', 'language_model.model.layers.18.self_attn.q_proj.weight', 'language_model.model.layers.18.self_attn.v_proj.weight', 
'language_model.model.layers.19.input_layernorm.weight', 'language_model.model.layers.19.mlp.down_proj.weight', 'language_model.model.layers.19.mlp.gate_proj.weight', 'language_model.model.layers.19.mlp.up_proj.weight', 'language_model.model.layers.19.post_attention_layernorm.weight', 'language_model.model.layers.19.self_attn.k_proj.weight', 'language_model.model.layers.19.self_attn.o_proj.weight', 'language_model.model.layers.19.self_attn.q_proj.weight', 'language_model.model.layers.19.self_attn.v_proj.weight', 'language_model.model.layers.2.input_layernorm.weight', 'language_model.model.layers.2.mlp.down_proj.weight', 'language_model.model.layers.2.mlp.gate_proj.weight', 'language_model.model.layers.2.mlp.up_proj.weight', 'language_model.model.layers.2.post_attention_layernorm.weight', 'language_model.model.layers.2.self_attn.k_proj.weight', 'language_model.model.layers.2.self_attn.o_proj.weight', 'language_model.model.layers.2.self_attn.q_proj.weight', 'language_model.model.layers.2.self_attn.v_proj.weight', 'language_model.model.layers.20.input_layernorm.weight', 'language_model.model.layers.20.mlp.down_proj.weight', 'language_model.model.layers.20.mlp.gate_proj.weight', 'language_model.model.layers.20.mlp.up_proj.weight', 'language_model.model.layers.20.post_attention_layernorm.weight', 'language_model.model.layers.20.self_attn.k_proj.weight', 'language_model.model.layers.20.self_attn.o_proj.weight', 'language_model.model.layers.20.self_attn.q_proj.weight', 'language_model.model.layers.20.self_attn.v_proj.weight', 'language_model.model.layers.21.input_layernorm.weight', 'language_model.model.layers.21.mlp.down_proj.weight', 'language_model.model.layers.21.mlp.gate_proj.weight', 'language_model.model.layers.21.mlp.up_proj.weight', 'language_model.model.layers.21.post_attention_layernorm.weight', 'language_model.model.layers.21.self_attn.k_proj.weight', 'language_model.model.layers.21.self_attn.o_proj.weight', 
'language_model.model.layers.21.self_attn.q_proj.weight', 'language_model.model.layers.21.self_attn.v_proj.weight', 'language_model.model.layers.22.input_layernorm.weight', 'language_model.model.layers.22.mlp.down_proj.weight', 'language_model.model.layers.22.mlp.gate_proj.weight', 'language_model.model.layers.22.mlp.up_proj.weight', 'language_model.model.layers.22.post_attention_layernorm.weight', 'language_model.model.layers.22.self_attn.k_proj.weight', 'language_model.model.layers.22.self_attn.o_proj.weight', 'language_model.model.layers.22.self_attn.q_proj.weight', 'language_model.model.layers.22.self_attn.v_proj.weight', 'language_model.model.layers.23.input_layernorm.weight', 'language_model.model.layers.23.mlp.down_proj.weight', 'language_model.model.layers.23.mlp.gate_proj.weight', 'language_model.model.layers.23.mlp.up_proj.weight', 'language_model.model.layers.23.post_attention_layernorm.weight', 'language_model.model.layers.23.self_attn.k_proj.weight', 'language_model.model.layers.23.self_attn.o_proj.weight', 'language_model.model.layers.23.self_attn.q_proj.weight', 'language_model.model.layers.23.self_attn.v_proj.weight', 'language_model.model.layers.24.input_layernorm.weight', 'language_model.model.layers.24.mlp.down_proj.weight', 'language_model.model.layers.24.mlp.gate_proj.weight', 'language_model.model.layers.24.mlp.up_proj.weight', 'language_model.model.layers.24.post_attention_layernorm.weight', 'language_model.model.layers.24.self_attn.k_proj.weight', 'language_model.model.layers.24.self_attn.o_proj.weight', 'language_model.model.layers.24.self_attn.q_proj.weight', 'language_model.model.layers.24.self_attn.v_proj.weight', 'language_model.model.layers.25.input_layernorm.weight', 'language_model.model.layers.25.mlp.down_proj.weight', 'language_model.model.layers.25.mlp.gate_proj.weight', 'language_model.model.layers.25.mlp.up_proj.weight', 'language_model.model.layers.25.post_attention_layernorm.weight', 
'language_model.model.layers.25.self_attn.k_proj.weight', 'language_model.model.layers.25.self_attn.o_proj.weight', 'language_model.model.layers.25.self_attn.q_proj.weight', 'language_model.model.layers.25.self_attn.v_proj.weight', 'language_model.model.layers.26.input_layernorm.weight', 'language_model.model.layers.26.mlp.down_proj.weight', 'language_model.model.layers.26.mlp.gate_proj.weight', 'language_model.model.layers.26.mlp.up_proj.weight', 'language_model.model.layers.26.post_attention_layernorm.weight', 'language_model.model.layers.26.self_attn.k_proj.weight', 'language_model.model.layers.26.self_attn.o_proj.weight', 'language_model.model.layers.26.self_attn.q_proj.weight', 'language_model.model.layers.26.self_attn.v_proj.weight', 'language_model.model.layers.27.input_layernorm.weight', 'language_model.model.layers.27.mlp.down_proj.weight', 'language_model.model.layers.27.mlp.gate_proj.weight', 'language_model.model.layers.27.mlp.up_proj.weight', 'language_model.model.layers.27.post_attention_layernorm.weight', 'language_model.model.layers.27.self_attn.k_proj.weight', 'language_model.model.layers.27.self_attn.o_proj.weight', 'language_model.model.layers.27.self_attn.q_proj.weight', 'language_model.model.layers.27.self_attn.v_proj.weight', 'language_model.model.layers.28.input_layernorm.weight', 'language_model.model.layers.28.mlp.down_proj.weight', 'language_model.model.layers.28.mlp.gate_proj.weight', 'language_model.model.layers.28.mlp.up_proj.weight', 'language_model.model.layers.28.post_attention_layernorm.weight', 'language_model.model.layers.28.self_attn.k_proj.weight', 'language_model.model.layers.28.self_attn.o_proj.weight', 'language_model.model.layers.28.self_attn.q_proj.weight', 'language_model.model.layers.28.self_attn.v_proj.weight', 'language_model.model.layers.29.input_layernorm.weight', 'language_model.model.layers.29.mlp.down_proj.weight', 'language_model.model.layers.29.mlp.gate_proj.weight', 
'language_model.model.layers.29.mlp.up_proj.weight', 'language_model.model.layers.29.post_attention_layernorm.weight', 'language_model.model.layers.29.self_attn.k_proj.weight', 'language_model.model.layers.29.self_attn.o_proj.weight', 'language_model.model.layers.29.self_attn.q_proj.weight', 'language_model.model.layers.29.self_attn.v_proj.weight', 'language_model.model.layers.3.input_layernorm.weight', 'language_model.model.layers.3.mlp.down_proj.weight', 'language_model.model.layers.3.mlp.gate_proj.weight', 'language_model.model.layers.3.mlp.up_proj.weight', 'language_model.model.layers.3.post_attention_layernorm.weight', 'language_model.model.layers.3.self_attn.k_proj.weight', 'language_model.model.layers.3.self_attn.o_proj.weight', 'language_model.model.layers.3.self_attn.q_proj.weight', 'language_model.model.layers.3.self_attn.v_proj.weight', 'language_model.model.layers.30.input_layernorm.weight', 'language_model.model.layers.30.mlp.down_proj.weight', 'language_model.model.layers.30.mlp.gate_proj.weight', 'language_model.model.layers.30.mlp.up_proj.weight', 'language_model.model.layers.30.post_attention_layernorm.weight', 'language_model.model.layers.30.self_attn.k_proj.weight', 'language_model.model.layers.30.self_attn.o_proj.weight', 'language_model.model.layers.30.self_attn.q_proj.weight', 'language_model.model.layers.30.self_attn.v_proj.weight', 'language_model.model.layers.31.input_layernorm.weight', 'language_model.model.layers.31.mlp.down_proj.weight', 'language_model.model.layers.31.mlp.gate_proj.weight', 'language_model.model.layers.31.mlp.up_proj.weight', 'language_model.model.layers.31.post_attention_layernorm.weight', 'language_model.model.layers.31.self_attn.k_proj.weight', 'language_model.model.layers.31.self_attn.o_proj.weight', 'language_model.model.layers.31.self_attn.q_proj.weight', 'language_model.model.layers.31.self_attn.v_proj.weight', 'language_model.model.layers.32.input_layernorm.weight', 
'language_model.model.layers.32.mlp.down_proj.weight', 'language_model.model.layers.32.mlp.gate_proj.weight', 'language_model.model.layers.32.mlp.up_proj.weight', 'language_model.model.layers.32.post_attention_layernorm.weight', 'language_model.model.layers.32.self_attn.k_proj.weight', 'language_model.model.layers.32.self_attn.o_proj.weight', 'language_model.model.layers.32.self_attn.q_proj.weight', 'language_model.model.layers.32.self_attn.v_proj.weight', 'language_model.model.layers.33.input_layernorm.weight', 'language_model.model.layers.33.mlp.down_proj.weight', 'language_model.model.layers.33.mlp.gate_proj.weight', 'language_model.model.layers.33.mlp.up_proj.weight', 'language_model.model.layers.33.post_attention_layernorm.weight', 'language_model.model.layers.33.self_attn.k_proj.weight', 'language_model.model.layers.33.self_attn.o_proj.weight', 'language_model.model.layers.33.self_attn.q_proj.weight', 'language_model.model.layers.33.self_attn.v_proj.weight', 'language_model.model.layers.34.input_layernorm.weight', 'language_model.model.layers.34.mlp.down_proj.weight', 'language_model.model.layers.34.mlp.gate_proj.weight', 'language_model.model.layers.34.mlp.up_proj.weight', 'language_model.model.layers.34.post_attention_layernorm.weight', 'language_model.model.layers.34.self_attn.k_proj.weight', 'language_model.model.layers.34.self_attn.o_proj.weight', 'language_model.model.layers.34.self_attn.q_proj.weight', 'language_model.model.layers.34.self_attn.v_proj.weight', 'language_model.model.layers.35.input_layernorm.weight', 'language_model.model.layers.35.mlp.down_proj.weight', 'language_model.model.layers.35.mlp.gate_proj.weight', 'language_model.model.layers.35.mlp.up_proj.weight', 'language_model.model.layers.35.post_attention_layernorm.weight', 'language_model.model.layers.35.self_attn.k_proj.weight', 'language_model.model.layers.35.self_attn.o_proj.weight', 'language_model.model.layers.35.self_attn.q_proj.weight', 
'language_model.model.layers.35.self_attn.v_proj.weight', 'language_model.model.layers.36.input_layernorm.weight', 'language_model.model.layers.36.mlp.down_proj.weight', 'language_model.model.layers.36.mlp.gate_proj.weight', 'language_model.model.layers.36.mlp.up_proj.weight', 'language_model.model.layers.36.post_attention_layernorm.weight', 'language_model.model.layers.36.self_attn.k_proj.weight', 'language_model.model.layers.36.self_attn.o_proj.weight', 'language_model.model.layers.36.self_attn.q_proj.weight', 'language_model.model.layers.36.self_attn.v_proj.weight', 'language_model.model.layers.37.input_layernorm.weight', 'language_model.model.layers.37.mlp.down_proj.weight', 'language_model.model.layers.37.mlp.gate_proj.weight', 'language_model.model.layers.37.mlp.up_proj.weight', 'language_model.model.layers.37.post_attention_layernorm.weight', 'language_model.model.layers.37.self_attn.k_proj.weight', 'language_model.model.layers.37.self_attn.o_proj.weight', 'language_model.model.layers.37.self_attn.q_proj.weight', 'language_model.model.layers.37.self_attn.v_proj.weight', 'language_model.model.layers.38.input_layernorm.weight', 'language_model.model.layers.38.mlp.down_proj.weight', 'language_model.model.layers.38.mlp.gate_proj.weight', 'language_model.model.layers.38.mlp.up_proj.weight', 'language_model.model.layers.38.post_attention_layernorm.weight', 'language_model.model.layers.38.self_attn.k_proj.weight', 'language_model.model.layers.38.self_attn.o_proj.weight', 'language_model.model.layers.38.self_attn.q_proj.weight', 'language_model.model.layers.38.self_attn.v_proj.weight', 'language_model.model.layers.39.input_layernorm.weight', 'language_model.model.layers.39.mlp.down_proj.weight', 'language_model.model.layers.39.mlp.gate_proj.weight', 'language_model.model.layers.39.mlp.up_proj.weight', 'language_model.model.layers.39.post_attention_layernorm.weight', 'language_model.model.layers.39.self_attn.k_proj.weight', 
'language_model.model.layers.39.self_attn.o_proj.weight', 'language_model.model.layers.39.self_attn.q_proj.weight', 'language_model.model.layers.39.self_attn.v_proj.weight', 'language_model.model.layers.4.input_layernorm.weight', 'language_model.model.layers.4.mlp.down_proj.weight', 'language_model.model.layers.4.mlp.gate_proj.weight', 'language_model.model.layers.4.mlp.up_proj.weight', 'language_model.model.layers.4.post_attention_layernorm.weight', 'language_model.model.layers.4.self_attn.k_proj.weight', 'language_model.model.layers.4.self_attn.o_proj.weight', 'language_model.model.layers.4.self_attn.q_proj.weight', 'language_model.model.layers.4.self_attn.v_proj.weight', 'language_model.model.layers.5.input_layernorm.weight', 'language_model.model.layers.5.mlp.down_proj.weight', 'language_model.model.layers.5.mlp.gate_proj.weight', 'language_model.model.layers.5.mlp.up_proj.weight', 'language_model.model.layers.5.post_attention_layernorm.weight', 'language_model.model.layers.5.self_attn.k_proj.weight', 'language_model.model.layers.5.self_attn.o_proj.weight', 'language_model.model.layers.5.self_attn.q_proj.weight', 'language_model.model.layers.5.self_attn.v_proj.weight', 'language_model.model.layers.6.input_layernorm.weight', 'language_model.model.layers.6.mlp.down_proj.weight', 'language_model.model.layers.6.mlp.gate_proj.weight', 'language_model.model.layers.6.mlp.up_proj.weight', 'language_model.model.layers.6.post_attention_layernorm.weight', 'language_model.model.layers.6.self_attn.k_proj.weight', 'language_model.model.layers.6.self_attn.o_proj.weight', 'language_model.model.layers.6.self_attn.q_proj.weight', 'language_model.model.layers.6.self_attn.v_proj.weight', 'language_model.model.layers.7.input_layernorm.weight', 'language_model.model.layers.7.mlp.down_proj.weight', 'language_model.model.layers.7.mlp.gate_proj.weight', 'language_model.model.layers.7.mlp.up_proj.weight', 'language_model.model.layers.7.post_attention_layernorm.weight', 
'language_model.model.layers.7.self_attn.k_proj.weight', 'language_model.model.layers.7.self_attn.o_proj.weight', 'language_model.model.layers.7.self_attn.q_proj.weight', 'language_model.model.layers.7.self_attn.v_proj.weight', 'language_model.model.layers.8.input_layernorm.weight', 'language_model.model.layers.8.mlp.down_proj.weight', 'language_model.model.layers.8.mlp.gate_proj.weight', 'language_model.model.layers.8.mlp.up_proj.weight', 'language_model.model.layers.8.post_attention_layernorm.weight', 'language_model.model.layers.8.self_attn.k_proj.weight', 'language_model.model.layers.8.self_attn.o_proj.weight', 'language_model.model.layers.8.self_attn.q_proj.weight', 'language_model.model.layers.8.self_attn.v_proj.weight', 'language_model.model.layers.9.input_layernorm.weight', 'language_model.model.layers.9.mlp.down_proj.weight', 'language_model.model.layers.9.mlp.gate_proj.weight', 'language_model.model.layers.9.mlp.up_proj.weight', 'language_model.model.layers.9.post_attention_layernorm.weight', 'language_model.model.layers.9.self_attn.k_proj.weight', 'language_model.model.layers.9.self_attn.o_proj.weight', 'language_model.model.layers.9.self_attn.q_proj.weight', 'language_model.model.layers.9.self_attn.v_proj.weight', 'language_model.model.norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Model loaded
Loading processor from /home/PLLaVA/MODELS/pllava-13b
Processor loaded
Frame size: (1280, 720), Mode: RGB
Frame size: (1280, 720), Mode: RGB
Frame size: (1280, 720), Mode: RGB
Frame size: (1280, 720), Mode: RGB
Input video shape: 4 1280 720
img_list shape: [(1280, 720), (1280, 720), (1280, 720), (1280, 720)]
Uploaded 1 images for processing.
Generated Prompt: You are a powerful Video Magic ChatBot, a large vision-language assistant.
You are able to understand the video content that the user provides and assist the user in a video-language related task.
The user might provide you with the video and maybe some extra noisy information to help you out or ask you a question. Make use of the information in a proper way to be competent for the job.

INSTRUCTIONS:

  1. Follow the user's instruction.

  2. Be critical yet believe in yourself.
    USER:
    USER: USER: describe the video ASSISTANT:
    Processed inputs keys: dict_keys(['pixel_values', 'input_ids', 'attention_mask'])
    Pixel values shape: torch.Size([4, 3, 336, 336])
    Pixel values: tensor([[[[-1.1061, -1.1061, -1.1353, ..., -0.2010, -0.2302, -0.2448],
    [-1.1353, -1.1207, -1.1353, ..., -0.2594, -0.2740, -0.2594],
    [-1.1353, -1.1353, -1.1353, ..., -0.0842, -0.2448, -0.2886],
    ...,
    [-0.3908, -0.2302, -0.1280, ..., 0.0325, 0.0179, 0.0179],
    [-0.3470, -0.1864, -0.0988, ..., 0.0325, 0.0179, 0.0179],
    [-0.3178, -0.1426, -0.0550, ..., 0.0325, 0.0179, 0.0179]],

      [[-0.6715, -0.6715, -0.7016,  ...,  0.3190,  0.3040,  0.2890],
       [-0.7016, -0.6865, -0.7016,  ...,  0.2740,  0.2589,  0.2740],
       [-0.7016, -0.7016, -0.7016,  ...,  0.4540,  0.3040,  0.2589],
       ...,
       [-0.2213, -0.0562,  0.0488,  ..., -0.2063, -0.2063, -0.2063],
       [-0.1763, -0.0112,  0.0789,  ..., -0.2063, -0.2063, -0.2063],
       [-0.1463,  0.0338,  0.1239,  ..., -0.2063, -0.2063, -0.2063]],
    
      [[-0.3000, -0.3000, -0.3284,  ...,  0.4395,  0.4110,  0.3684],
       [-0.3284, -0.3142, -0.3284,  ...,  0.3542,  0.2973,  0.2973],
       [-0.3284, -0.3284, -0.3284,  ...,  0.4964,  0.2831,  0.2262],
       ...,
       [-0.2004, -0.0156,  0.0982,  ..., -0.1435, -0.1720, -0.1720],
       [-0.1720,  0.0271,  0.1266,  ..., -0.1435, -0.1720, -0.1720],
       [-0.1293,  0.0698,  0.1693,  ..., -0.1435, -0.1720, -0.1720]]],
    
    
     [[[ 0.6603,  0.7041,  0.8647,  ...,  0.6311,  0.6603,  0.7771],
       [ 0.4851,  0.5873,  0.7041,  ...,  0.5727,  0.5289,  0.6895],
       [ 0.4997,  0.8501,  0.5289,  ...,  0.5873,  0.3099,  0.5143],
       ...,
       [ 0.8501,  1.2734,  1.1712,  ..., -0.1864, -0.7266, -1.3397],
       [ 0.6749,  1.2734,  1.2150,  ...,  0.6457,  0.0325, -0.5660],
       [ 0.5581,  0.9230,  0.8792,  ...,  1.2734,  1.2588,  1.1420]],
    
      [[ 0.7692,  0.8142,  0.9793,  ...,  0.6041,  0.6341,  0.7542],
       [ 0.5891,  0.6942,  0.8142,  ...,  0.5441,  0.4991,  0.6642],
       [ 0.6041,  0.9643,  0.6341,  ...,  0.5591,  0.2740,  0.4991],
       ...,
       [ 0.9193,  1.3545,  1.2495,  ..., -0.1012, -0.6415, -1.2568],
       [ 0.7392,  1.3545,  1.2945,  ...,  0.7542,  0.1239, -0.4764],
       [ 0.6191,  0.9943,  0.9493,  ...,  1.4145,  1.3845,  1.2945]],
    
      [[ 0.6812,  0.7239,  0.8803,  ...,  0.4964,  0.5532,  0.7239],
       [ 0.5106,  0.6101,  0.7239,  ...,  0.4395,  0.4253,  0.6244],
       [ 0.5248,  0.8661,  0.5532,  ...,  0.4537,  0.2120,  0.4679],
       ...,
       [ 0.8661,  1.2785,  1.1789,  ..., -0.1435, -0.6128, -1.1958],
       [ 0.6955,  1.2785,  1.2216,  ...,  0.6812,  0.1266, -0.4422],
       [ 0.5817,  0.9372,  0.8945,  ...,  1.3211,  1.3211,  1.2358]]],
    
    
     [[[ 0.0179, -0.2448, -0.4346,  ...,  0.0763,  0.2953,  0.3391],
       [ 0.2077, -0.1280, -0.3032,  ..., -0.1718, -0.0405,  0.3537],
       [ 0.2369,  0.2369,  0.1347,  ...,  0.5143,  0.0617,  0.2515],
       ...,
       [-0.9164, -0.5368,  0.4267,  ..., -0.2886, -0.1572,  0.2077],
       [-0.5076, -0.2010,  0.4705,  ..., -0.2448,  0.2077,  0.0909],
       [-0.1864,  0.0471,  0.4413,  ...,  0.1055,  0.2807, -0.3324]],
    
      [[ 0.3040,  0.0338, -0.1613,  ...,  0.0638,  0.2890,  0.3190],
       [ 0.4991,  0.1539, -0.0262,  ..., -0.1913, -0.0562,  0.3490],
       [ 0.5291,  0.5291,  0.4240,  ...,  0.5141,  0.0488,  0.2289],
       ...,
       [-0.9117, -0.5215,  0.4691,  ..., -0.1012,  0.0488,  0.4390],
       [-0.4914, -0.1763,  0.5141,  ..., -0.0862,  0.4090,  0.3040],
       [-0.1613,  0.0789,  0.4841,  ...,  0.2740,  0.4841, -0.1313]],
    
      [[ 0.2688,  0.0129, -0.1720,  ...,  0.0129,  0.2120,  0.2831],
       [ 0.4537,  0.1266, -0.0440,  ..., -0.2146, -0.1151,  0.2973],
       [ 0.4821,  0.4821,  0.3826,  ...,  0.4537, -0.0156,  0.1977],
       ...,
       [-0.7977, -0.4279,  0.5106,  ..., -0.0724,  0.0840,  0.4537],
       [-0.3995, -0.1009,  0.5532,  ..., -0.0582,  0.4253,  0.2973],
       [-0.0867,  0.1409,  0.5248,  ...,  0.2831,  0.4537, -0.1435]]],
    
    
     [[[ 0.5873,  0.0763,  0.0033,  ...,  0.1785,  0.1785,  0.5289],
       [ 0.4705,  0.1931,  0.3829,  ...,  0.0179, -0.3324,  0.3975],
       [ 0.3099,  0.2807,  0.4705,  ...,  0.7041,  0.3099,  0.8209],
       ...,
       [ 0.8063,  0.6895,  0.6457,  ...,  0.5581,  0.7333,  0.8501],
       [ 0.6165,  0.5727,  0.7917,  ...,  0.6311,  1.0252,  1.2880],
       [ 0.7771,  0.7625,  0.8792,  ...,  0.4267,  0.9668,  1.2588]],
    
      [[ 0.6191,  0.0939,  0.0188,  ...,  0.1689,  0.1689,  0.5291],
       [ 0.4991,  0.2139,  0.4090,  ...,  0.0038, -0.3564,  0.3940],
       [ 0.3340,  0.3040,  0.4991,  ...,  0.7092,  0.3040,  0.8292],
       ...,
       [ 1.1144,  0.9943,  1.0093,  ...,  0.5291,  0.7242,  0.8743],
       [ 0.9043,  0.8743,  1.1594,  ...,  0.6041,  1.0393,  1.3395],
       [ 1.0694,  1.0694,  1.2344,  ...,  0.3940,  0.9793,  1.3095]],
    
      [[ 0.5959,  0.0982,  0.0271,  ...,  0.0555,  0.0555,  0.3968],
       [ 0.4821,  0.2120,  0.3968,  ..., -0.1009, -0.4422,  0.2688],
       [ 0.3257,  0.2973,  0.4821,  ...,  0.5675,  0.1835,  0.6812],
       ...,
       [ 1.3496,  1.2358,  1.2216,  ...,  0.4110,  0.6101,  0.7666],
       [ 1.1505,  1.1221,  1.3638,  ...,  0.4821,  0.8803,  1.1932],
       [ 1.3069,  1.3069,  1.4491,  ...,  0.2831,  0.8234,  1.1647]]]])
    

Input IDs shape: torch.Size([1, 140])
Input IDs: tensor([[ 1, 887, 526, 263, 13988, 13987, 26494, 678, 271, 29933,
327, 29892, 263, 2919, 18551, 29899, 11675, 20255, 29889, 29871,
13, 3492, 526, 2221, 304, 2274, 278, 4863, 2793, 393,
278, 1404, 8128, 322, 6985, 278, 1404, 297, 263, 4863,
29899, 11675, 4475, 3414, 29889, 13, 1576, 1404, 1795, 3867,
366, 411, 278, 4863, 322, 5505, 777, 4805, 694, 13344,
2472, 304, 1371, 366, 714, 470, 2244, 366, 263, 1139,
29889, 8561, 671, 310, 278, 2472, 297, 263, 1571, 982,
304, 367, 5100, 296, 363, 278, 4982, 29889, 13, 2277,
29937, 2672, 10810, 29965, 9838, 29903, 29901, 13, 29896, 29889,
10306, 278, 1404, 29915, 29879, 15278, 29889, 13, 29906, 29889,
1522, 12187, 3447, 4658, 297, 7535, 29889, 13, 3148, 1001,
29901, 29871, 32000, 29871, 13, 3148, 1001, 29901, 29871, 3148,
1001, 29901, 8453, 278, 4863, 319, 1799, 9047, 13566, 29901]])
###PROMPT: You are a powerful Video Magic ChatBot, a large vision-language assistant.
You are able to understand the video content that the user provides and assist the user in a video-language related task.
The user might provide you with the video and maybe some extra noisy information to help you out or ask you a question. Make use of the information in a proper way to be competent for the job.

INSTRUCTIONS:

  1. Follow the user's instruction.
  2. Be critical yet believe in yourself.
    USER:
    USER: USER: describe the video ASSISTANT:
    ###LM OUTPUT TEXT You are a powerful Video Magic ChatBot, a large vision-language assistant.
    You are able to understand the video content that the user provides and assist the user in a video-language related task.
    The user might provide you with the video and maybe some extra noisy information to help you out or ask you a question. Make use of the information in a proper way to be competent for the job.

INSTRUCTIONS:

  1. Follow the user's instruction.
  2. Be critical yet believe in yourself.
    USER:
    USER: USER: describe the video ASSISTANT:
    Question: describe the video
    Answer:

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions