-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinference.py
More file actions
67 lines (56 loc) · 2.22 KB
/
Copy pathinference.py
File metadata and controls
67 lines (56 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import torch
import torchvision.transforms as transforms
from PIL import Image
from model import CNNtoRNN
import pickle
import argparse
import sys
def predict_caption(image_path: str, model_path: str, vocab_path: str) -> str:
    """Generate, print and return a caption for a single image.

    Loads the pickled vocabulary and the ``CNNtoRNN`` weights, preprocesses
    the image (resize to 299x299, convert to tensor, normalize each channel
    with mean 0.5 and std 0.5) and asks the model for a caption.

    Args:
        image_path: Path to the image file; it is opened with PIL and
            converted to RGB.
        model_path: Path to a saved state dict for ``CNNtoRNN``.
        vocab_path: Path to the pickled vocabulary object.

    Returns:
        The items returned by ``model.caption_image`` joined with single
        spaces. Only the printed caption is capitalized; the returned
        string is not.

    Raises:
        SystemExit: With exit code 1 when the vocabulary file or the model
            file does not exist, or when the image cannot be opened.
    """
    # Prefer CUDA, then Apple MPS, then CPU. ``torch.backends.mps`` is
    # missing on older PyTorch builds, so look it up defensively instead of
    # letting an AttributeError escape.
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif mps_backend is not None and mps_backend.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    # Load Vocabulary
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only point this at vocabulary files from a trusted source.
    try:
        with open(vocab_path, "rb") as f:
            vocab = pickle.load(f)
    except FileNotFoundError:
        print(f"Vocabulary file {vocab_path} not found. Please train the model first.")
        sys.exit(1)

    vocab_size = len(vocab)
    # NOTE(review): presumably these must equal the values used when the
    # checkpoint was trained, otherwise load_state_dict will reject it.
    embed_size = 256
    hidden_size = 256
    num_layers = 1

    # Load Model
    model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
    try:
        # SECURITY: depending on the PyTorch version's ``weights_only``
        # default, torch.load may unpickle arbitrary objects; load only
        # trusted checkpoints.
        model.load_state_dict(torch.load(model_path, map_location=device))
    except FileNotFoundError:
        print(f"Model file {model_path} not found. Please train the model first.")
        sys.exit(1)

    # Switch layers such as dropout/batch-norm to inference behaviour.
    model.eval()

    # Load and Transform Image
    try:
        image = Image.open(image_path).convert("RGB")
    except Exception as e:
        print(f"Error opening image: {e}")
        sys.exit(1)

    transform = transforms.Compose([
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

    # unsqueeze(0) adds the leading batch dimension before moving to device.
    image_tensor = transform(image).unsqueeze(0).to(device)

    # Predict caption
    # Inference only: disable autograd so no graph or gradient buffers are
    # kept while decoding. Harmless if caption_image already does this.
    with torch.no_grad():
        caption_list = model.caption_image(image_tensor, vocab)
    caption = " ".join(caption_list)

    print("--------------------------------------------------")
    print(f"Generated Caption: {caption.capitalize()}")
    print("--------------------------------------------------")

    return caption
if __name__ == "__main__":
    # Command-line entry point: --image is mandatory, while --model and
    # --vocab fall back to the default artifact names.
    cli = argparse.ArgumentParser(
        description="Image Caption Generator Inference",
    )
    cli.add_argument(
        "--image",
        type=str,
        required=True,
        help="Path to the image file",
    )
    cli.add_argument(
        "--model",
        type=str,
        default="caption_model.pth",
        help="Path to trained model",
    )
    cli.add_argument(
        "--vocab",
        type=str,
        default="vocab.pkl",
        help="Path to saved vocabulary",
    )
    options = cli.parse_args()

    predict_caption(
        image_path=options.image,
        model_path=options.model,
        vocab_path=options.vocab,
    )