From bce79ed7cccd8bf4f1403e7752a6636b6a406cdd Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Wed, 3 Apr 2024 11:28:04 -0400 Subject: [PATCH 01/32] Create reportreport --- reportreport | 1 + 1 file changed, 1 insertion(+) create mode 100644 reportreport diff --git a/reportreport b/reportreport new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/reportreport @@ -0,0 +1 @@ + From 4dff444b9374a699a598c7ce10413cbe4aa2e0cf Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Wed, 3 Apr 2024 11:30:50 -0400 Subject: [PATCH 02/32] Rename reportreport to report.md --- reportreport => report.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename reportreport => report.md (100%) diff --git a/reportreport b/report.md similarity index 100% rename from reportreport rename to report.md From cc7e2bf0923f05e9ff2c12d4fb2c86bb33d1965c Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Fri, 5 Apr 2024 08:55:42 -0400 Subject: [PATCH 03/32] Update report.md --- report.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/report.md b/report.md index 8b13789..e94f416 100644 --- a/report.md +++ b/report.md @@ -1 +1,4 @@ +# DS598 DL4DS Midterm Project +## Introduction +The project aims to provide image-to-captioning services for blind people using AI technology. The project employs the [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), fine-tuned on the [VizWiz Image Captioning dataset](https://vizwiz.org/tasks-and-datasets/image-captioning/). The optimizer is AdamW with specific settings: a learning rate of 2e-5 and a weight decay of 5e-4. I trained it 15 epochs, and stopped it early at epoch 3, since it was overfitting afterwards. My best CIDEr-D score on the [test dataset](https://eval.ai/web/challenges/challenge-page/739/leaderboard/2006) is 75.37. 
From f177ea3a3a77ab179bde16bb9b3ff9436327b540 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Fri, 5 Apr 2024 08:57:52 -0400 Subject: [PATCH 04/32] Rename report.md to REPORT.md --- report.md => REPORT.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename report.md => REPORT.md (100%) diff --git a/report.md b/REPORT.md similarity index 100% rename from report.md rename to REPORT.md From cd0221e5effa8477912633afaadaf37734c349d9 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Fri, 5 Apr 2024 09:18:04 -0400 Subject: [PATCH 05/32] Add files via upload --- src/demo_model/train.py | 60 +++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/src/demo_model/train.py b/src/demo_model/train.py index 6372bdf..c3c9b00 100644 --- a/src/demo_model/train.py +++ b/src/demo_model/train.py @@ -1,23 +1,25 @@ import torch from torch.utils.data import DataLoader, Dataset, Subset from torchvision import transforms +import torch.optim as optim from src.base.constants import * from src.base.helpers import * from src.base.vizwiz_eval_cap.eval import VizWizEvalCap from dataset import DemoDataset ## This is a local import from dataset.pyA from tqdm import tqdm -from transformers import AutoProcessor -from transformers import AutoModelForCausalLM +from transformers import BlipProcessor +from transformers import BlipForConditionalGeneration from PIL import Image import matplotlib.pyplot as plt import os import json + ################################################################################ # This is template code that will not run as is since a model is not defined but # is has much of the infrastructure needed to fine-tune a model on the VizWiz # dataset. -# +#custom # At a minimum you will have to complete code indicated by TODO comments. 
################################################################################ @@ -32,7 +34,12 @@ # to encode and decode text and images. # https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoProcessor try: - processor = AutoProcessor.from_pretrained("replace-with-model-choice", cache_dir=CACHE_DIR) + #processor = AutoProcessor.from_pretrained("replace-with-model-choice", cache_dir=CACHE_DIR) + #processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", cache_dir=CACHE_DIR) + #processor = AutoProcessor.from_pretrained("microsoft/git-base", cache_dir=CACHE_DIR) + #processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large", cache_dir=CACHE_DIR) + processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base", cache_dir=CACHE_DIR) + except Exception as e: print("You need to pick a pre-trained model from HuggingFace.") print("Exception: ", e) @@ -51,8 +58,8 @@ ) ### Use the Subset while debugging ### -# train_dataset = Subset(train_dataset, range(100)) -# val_dataset = Subset(val_dataset, range(10)) +#train_dataset = Subset(train_dataset, range(100)) +#val_dataset = Subset(val_dataset, range(10)) ### Since, subset is used above, the dataset object needs to be called with a .dataset, to access the original dataset. So while using the full dataset, the below is done. ### train_dataset = Subset(train_dataset, range(len(train_dataset))) @@ -64,7 +71,7 @@ print("SANITY CHECK DONE!!") -train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8) +train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=6) val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=32) ## TODO @@ -72,17 +79,21 @@ # model you want to fine-tune. This will allow you to use the model to train and evaluate # on the VizWiz dataset. 
try: - model = AutoModelForCausalLM.from_pretrained("replace-with-model-choice", cache_dir=CACHE_DIR) + #model = AutoModelForCausalLM.from_pretrained("replace-with-model-choice", cache_dir=CACHE_DIR) + model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", cache_dir=CACHE_DIR) + except Exception as e: print("You need to pick a pre-trained model from HuggingFace.") print("Exception: ", e) ## TODO Select your model optimizer try: - raise NotImplementedError("Select your model optimizer") - optimizer = None # pick one from torch.optim + # raise NotImplementedError("Select your model optimizer") + # optimizer = None # pick one from t pick an optimizer from torch.optimorch.optim + optimizer = torch.optim.AdamW(model.parameters(), lr=0.00002, betas=(0.9, 0.999), weight_decay=0.0005) + except Exception as e: - print("You need to pick an optimizer from torch.optim.") + print("You need to.") print("Exception: ", e) # Wrap the model with DataParallel only if more than one GPU is available @@ -94,7 +105,8 @@ method = "CIDEr" # method used for comparsions -logger = Logger(f"{DEMO_SAVE_PATH}/logs.log") +i="0" # change the logger path +logger = Logger(f"{DEMO_SAVE_PATH}/logs_{i}.log") # modify for each model def train(loger, train_dataloader, model, optimizer, device, processor): @@ -132,12 +144,16 @@ def evaluate( for idx, batch in enumerate(val_dataloader): image_ids = batch.pop("image_ids").to(device) pixel_values = batch.pop("pixel_values").to(device) - + with torch.no_grad(): outputs = model.generate(pixel_values=pixel_values, max_length=50) + # debug when prediction is empty + # print("Raw Output:", outputs) # Decode the generated ids to text generated_captions = processor.batch_decode(outputs, skip_special_tokens=True) + # debug when prediction is empty + # print("Decoded Output:", generated_captions) # Store the generated captions for img_id, caption in zip(image_ids, generated_captions): @@ -147,11 +163,13 @@ def evaluate( 
plot_captions_dict[img_id.item()] = caption # Used for plotting # Save the generated captions to a json file - with open(f"{save_path}/generated_captions.json", "w") as f: + # Change the path + with open(f"{save_path}/generated_captions_{i}.json", "w") as f: json.dump(caption_val, f, indent=4) + # Change the path vizwizRes = val_dataset.dataset.vizwiz.loadRes( - f"{save_path}/generated_captions.json" + f"{save_path}/generated_captions_{i}.json" ) vizwizEval = VizWizEvalCap(val_dataset.dataset.vizwiz, vizwizRes) vizwizEval.evaluate() @@ -160,7 +178,7 @@ def evaluate( for method in vizwizEval.eval: logger.info(f" Method: {method}, Score: {vizwizEval.eval[method]:.4f}") - return vizwizEval, vizwizRes, plot_captions_dict + return vizwizEval, vizwizRes, plot_captions_dict, model def get_val_examples(vizwizEval, vizwizRes, plot_captions_dict, epoch, method="CIDEr"): @@ -223,7 +241,7 @@ def get_val_examples(vizwizEval, vizwizRes, plot_captions_dict, epoch, method="C best_score = 0 -for epoch in range(3): +for epoch in range(16): print(f"Epoch: {epoch+1}") # Wrap the dataloader with tqdm for a progress bar progress_bar = tqdm( @@ -233,10 +251,10 @@ def get_val_examples(vizwizEval, vizwizRes, plot_captions_dict, epoch, method="C # Train the model loss = train(logger, train_dataloader, model, optimizer, device, processor) logger.info(f"Loss at epoch {epoch}: {loss}") - + # Evaluate the model every 3 epochs if epoch % 3 == 0: - vizwizEval, vizwizRes, plot_captions_dict = evaluate( + vizwizEval, vizwizRes, plot_captions_dict, model = evaluate( logger, epoch, DEMO_SAVE_PATH, @@ -249,7 +267,9 @@ def get_val_examples(vizwizEval, vizwizRes, plot_captions_dict, epoch, method="C score = vizwizEval.eval[method] if score > best_score: best_score = score - model.save_pretrained(f"{DEMO_SAVE_PATH}/best_model") + model.save_pretrained(f"{DEMO_SAVE_PATH}/best_model_0") logger.info(f"New best score: {best_score}. 
Model saved") get_val_examples(vizwizEval, vizwizRes, plot_captions_dict, epoch, method) + + \ No newline at end of file From f1126a2b8d79a5ed1390a38da66789ebae5081a3 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Fri, 5 Apr 2024 09:22:25 -0400 Subject: [PATCH 06/32] Update train.py --- src/demo_model/train.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/demo_model/train.py b/src/demo_model/train.py index c3c9b00..4477073 100644 --- a/src/demo_model/train.py +++ b/src/demo_model/train.py @@ -35,9 +35,6 @@ # https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoProcessor try: #processor = AutoProcessor.from_pretrained("replace-with-model-choice", cache_dir=CACHE_DIR) - #processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", cache_dir=CACHE_DIR) - #processor = AutoProcessor.from_pretrained("microsoft/git-base", cache_dir=CACHE_DIR) - #processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large", cache_dir=CACHE_DIR) processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base", cache_dir=CACHE_DIR) except Exception as e: @@ -272,4 +269,4 @@ def get_val_examples(vizwizEval, vizwizRes, plot_captions_dict, epoch, method="C get_val_examples(vizwizEval, vizwizRes, plot_captions_dict, epoch, method) - \ No newline at end of file + From b4c8337dc83ecc3d95aa652555f09237a136c703 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Fri, 5 Apr 2024 09:25:32 -0400 Subject: [PATCH 07/32] Update train.py --- src/demo_model/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/demo_model/train.py b/src/demo_model/train.py index 4477073..3f59543 100644 --- a/src/demo_model/train.py +++ b/src/demo_model/train.py @@ -26,7 +26,7 @@ CACHE_DIR = os.environ.get("TRANSFORMERS_CACHE") create_directory(DEMO_SAVE_PATH) -create_directory(DEMO_SAVE_PATH + "/examples") 
+create_directory(DEMO_SAVE_PATH + "/examples_0") ## TODO # You can use the AutoProcessor.from_pretrained() method to load the HuggingFace From bf30831d65839dc92232bb4d98a5d0ad32cf1bf6 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Fri, 5 Apr 2024 09:27:19 -0400 Subject: [PATCH 08/32] Update train.py --- src/demo_model/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/demo_model/train.py b/src/demo_model/train.py index 3f59543..597e8b5 100644 --- a/src/demo_model/train.py +++ b/src/demo_model/train.py @@ -227,13 +227,13 @@ def get_val_examples(vizwizEval, vizwizRes, plot_captions_dict, epoch, method="C # Save the images and captions save_image_captions( - best_img_and_captions, f"{DEMO_SAVE_PATH}/examples/epoch_{epoch}/best/" + best_img_and_captions, f"{DEMO_SAVE_PATH}/examples_0/epoch_{epoch}/best/" ) save_image_captions( - worst_img_and_captions, f"{DEMO_SAVE_PATH}/examples/epoch_{epoch}/worst/" + worst_img_and_captions, f"{DEMO_SAVE_PATH}/examples_0/epoch_{epoch}/worst/" ) save_image_captions( - first_3_img_and_captions, f"{DEMO_SAVE_PATH}/examples/epoch_{epoch}/first_3/" + first_3_img_and_captions, f"{DEMO_SAVE_PATH}/examples_0/epoch_{epoch}/first_3/" ) From 5048c9eed0e0f52587d93e3d11ed63309cd9a854 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Fri, 5 Apr 2024 09:35:43 -0400 Subject: [PATCH 09/32] Add files via upload --- src/demo_model/test.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/demo_model/test.py b/src/demo_model/test.py index 31c8690..6b7edd7 100644 --- a/src/demo_model/test.py +++ b/src/demo_model/test.py @@ -6,8 +6,8 @@ from src.base.vizwiz_eval_cap.eval import VizWizEvalCap from dataset import DemoDataset from tqdm import tqdm -from transformers import AutoProcessor -from transformers import AutoModelForCausalLM +from transformers import BlipProcessor +from transformers 
import BlipForConditionalGeneration from PIL import Image import matplotlib.pyplot as plt import os @@ -20,10 +20,11 @@ create_directory(DEMO_SAVE_PATH + "/examples") # The path below points to the location where the model was saved -MODEL_PATH = f"{DEMO_SAVE_PATH}/best_model" +MODEL_PATH = f"{DEMO_SAVE_PATH}/best_model_0" # Load your fine tuned model -model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, cache_dir=CACHE_DIR) +#model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, cache_dir=CACHE_DIR) +model = BlipForConditionalGeneration.from_pretrained(MODEL_PATH, cache_dir=CACHE_DIR) ## TODO # You can use the AutoProcessor.from_pretrained() method to load the HuggingFace @@ -33,7 +34,9 @@ # # Of course you should use the same model you trained with. try: - processor = AutoProcessor.from_pretrained("replace-with-model-choice", cache_dir=CACHE_DIR) + #processor = AutoProcessor.from_pretrained("replace-with-model-choice", cache_dir=CACHE_DIR) + processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base", cache_dir=CACHE_DIR) + except Exception as e: print("You need to pick a pre-trained model from HuggingFace.") print("Exception: ", e) @@ -70,7 +73,7 @@ {"image_id": img_id.item(), "caption": caption} ) # Used for VizWizEvalCap -with open(DEMO_SAVE_PATH + "/test_captions.json", "w") as f: +with open(DEMO_SAVE_PATH + "/test_captions_0.json", "w") as f: json.dump(caption_val, f, indent=4) print("Test captions saved to disk!!") From f64f6e661f720a3ed6bfec9b9753229fed6a2dee Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Fri, 5 Apr 2024 10:02:07 -0400 Subject: [PATCH 10/32] Create logs_0.log --- RESULTS/git/logs_0.log | 1 + 1 file changed, 1 insertion(+) create mode 100644 RESULTS/git/logs_0.log diff --git a/RESULTS/git/logs_0.log b/RESULTS/git/logs_0.log new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/RESULTS/git/logs_0.log @@ -0,0 +1 @@ + From 
6dd6c8a21a5063a0d5134b484735d0839df70722 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Fri, 5 Apr 2024 10:02:32 -0400 Subject: [PATCH 11/32] Add files via upload --- RESULTS/git/logs_0.txt | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 RESULTS/git/logs_0.txt diff --git a/RESULTS/git/logs_0.txt b/RESULTS/git/logs_0.txt new file mode 100644 index 0000000..49d3a02 --- /dev/null +++ b/RESULTS/git/logs_0.txt @@ -0,0 +1,37 @@ +2024-04-04 21:31:39 - INFO - Loss at epoch 0: 1.4142827987670898 +2024-04-04 21:46:26 - INFO - Validation scores at epoch: 0 +2024-04-04 21:46:26 - INFO - Method: Bleu_1, Score: 0.6716 +2024-04-04 21:46:26 - INFO - Method: Bleu_2, Score: 0.4842 +2024-04-04 21:46:26 - INFO - Method: Bleu_3, Score: 0.3374 +2024-04-04 21:46:26 - INFO - Method: Bleu_4, Score: 0.2291 +2024-04-04 21:46:26 - INFO - Method: CIDEr, Score: 0.6518 +2024-04-04 21:46:28 - INFO - New best score: 0.6518478374556982. Model saved +2024-04-04 22:40:13 - INFO - Loss at epoch 1: 1.4064964056015015 +2024-04-04 23:34:47 - INFO - Loss at epoch 2: 1.402627944946289 +2024-04-05 00:29:21 - INFO - Loss at epoch 3: 1.4132815599441528 +2024-04-05 00:42:06 - INFO - Validation scores at epoch: 3 +2024-04-05 00:42:06 - INFO - Method: Bleu_1, Score: 0.6770 +2024-04-05 00:42:06 - INFO - Method: Bleu_2, Score: 0.4912 +2024-04-05 00:42:06 - INFO - Method: Bleu_3, Score: 0.3455 +2024-04-05 00:42:06 - INFO - Method: Bleu_4, Score: 0.2375 +2024-04-05 00:42:06 - INFO - Method: CIDEr, Score: 0.7051 +2024-04-05 00:42:07 - INFO - New best score: 0.70508115339003. 
Model saved +2024-04-05 01:36:36 - INFO - Loss at epoch 4: 1.4088084697723389 +2024-04-05 02:30:36 - INFO - Loss at epoch 5: 1.424518346786499 +2024-04-05 03:23:51 - INFO - Loss at epoch 6: 1.3944379091262817 +2024-04-05 03:36:20 - INFO - Validation scores at epoch: 6 +2024-04-05 03:36:20 - INFO - Method: Bleu_1, Score: 0.6757 +2024-04-05 03:36:20 - INFO - Method: Bleu_2, Score: 0.4938 +2024-04-05 03:36:20 - INFO - Method: Bleu_3, Score: 0.3489 +2024-04-05 03:36:20 - INFO - Method: Bleu_4, Score: 0.2419 +2024-04-05 03:36:20 - INFO - Method: CIDEr, Score: 0.7261 +2024-04-05 03:36:22 - INFO - New best score: 0.726137869582739. Model saved +2024-04-05 04:30:43 - INFO - Loss at epoch 7: 1.396859884262085 +2024-04-05 05:24:52 - INFO - Loss at epoch 8: 1.3920198678970337 +2024-04-05 06:19:02 - INFO - Loss at epoch 9: 1.4171195030212402 +2024-04-05 06:31:05 - INFO - Validation scores at epoch: 9 +2024-04-05 06:31:05 - INFO - Method: Bleu_1, Score: 0.6786 +2024-04-05 06:31:05 - INFO - Method: Bleu_2, Score: 0.4952 +2024-04-05 06:31:05 - INFO - Method: Bleu_3, Score: 0.3530 +2024-04-05 06:31:05 - INFO - Method: Bleu_4, Score: 0.2466 +2024-04-05 06:31:05 - INFO - Method: CIDEr, Score: 0.7126 From 67c5fb7f4bdff33da6a911a64bcff9aef44f2320 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Fri, 5 Apr 2024 10:04:11 -0400 Subject: [PATCH 12/32] Delete RESULTS/git/logs_0.log --- RESULTS/git/logs_0.log | 1 - 1 file changed, 1 deletion(-) delete mode 100644 RESULTS/git/logs_0.log diff --git a/RESULTS/git/logs_0.log b/RESULTS/git/logs_0.log deleted file mode 100644 index 8b13789..0000000 --- a/RESULTS/git/logs_0.log +++ /dev/null @@ -1 +0,0 @@ - From 6450c74d1c4ab172b9c6aaf6224dbc88e8009497 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Fri, 5 Apr 2024 10:04:26 -0400 Subject: [PATCH 13/32] Delete RESULTS/git/logs_0.txt --- RESULTS/git/logs_0.txt | 37 ------------------------------------- 1 
file changed, 37 deletions(-) delete mode 100644 RESULTS/git/logs_0.txt diff --git a/RESULTS/git/logs_0.txt b/RESULTS/git/logs_0.txt deleted file mode 100644 index 49d3a02..0000000 --- a/RESULTS/git/logs_0.txt +++ /dev/null @@ -1,37 +0,0 @@ -2024-04-04 21:31:39 - INFO - Loss at epoch 0: 1.4142827987670898 -2024-04-04 21:46:26 - INFO - Validation scores at epoch: 0 -2024-04-04 21:46:26 - INFO - Method: Bleu_1, Score: 0.6716 -2024-04-04 21:46:26 - INFO - Method: Bleu_2, Score: 0.4842 -2024-04-04 21:46:26 - INFO - Method: Bleu_3, Score: 0.3374 -2024-04-04 21:46:26 - INFO - Method: Bleu_4, Score: 0.2291 -2024-04-04 21:46:26 - INFO - Method: CIDEr, Score: 0.6518 -2024-04-04 21:46:28 - INFO - New best score: 0.6518478374556982. Model saved -2024-04-04 22:40:13 - INFO - Loss at epoch 1: 1.4064964056015015 -2024-04-04 23:34:47 - INFO - Loss at epoch 2: 1.402627944946289 -2024-04-05 00:29:21 - INFO - Loss at epoch 3: 1.4132815599441528 -2024-04-05 00:42:06 - INFO - Validation scores at epoch: 3 -2024-04-05 00:42:06 - INFO - Method: Bleu_1, Score: 0.6770 -2024-04-05 00:42:06 - INFO - Method: Bleu_2, Score: 0.4912 -2024-04-05 00:42:06 - INFO - Method: Bleu_3, Score: 0.3455 -2024-04-05 00:42:06 - INFO - Method: Bleu_4, Score: 0.2375 -2024-04-05 00:42:06 - INFO - Method: CIDEr, Score: 0.7051 -2024-04-05 00:42:07 - INFO - New best score: 0.70508115339003. Model saved -2024-04-05 01:36:36 - INFO - Loss at epoch 4: 1.4088084697723389 -2024-04-05 02:30:36 - INFO - Loss at epoch 5: 1.424518346786499 -2024-04-05 03:23:51 - INFO - Loss at epoch 6: 1.3944379091262817 -2024-04-05 03:36:20 - INFO - Validation scores at epoch: 6 -2024-04-05 03:36:20 - INFO - Method: Bleu_1, Score: 0.6757 -2024-04-05 03:36:20 - INFO - Method: Bleu_2, Score: 0.4938 -2024-04-05 03:36:20 - INFO - Method: Bleu_3, Score: 0.3489 -2024-04-05 03:36:20 - INFO - Method: Bleu_4, Score: 0.2419 -2024-04-05 03:36:20 - INFO - Method: CIDEr, Score: 0.7261 -2024-04-05 03:36:22 - INFO - New best score: 0.726137869582739. 
Model saved -2024-04-05 04:30:43 - INFO - Loss at epoch 7: 1.396859884262085 -2024-04-05 05:24:52 - INFO - Loss at epoch 8: 1.3920198678970337 -2024-04-05 06:19:02 - INFO - Loss at epoch 9: 1.4171195030212402 -2024-04-05 06:31:05 - INFO - Validation scores at epoch: 9 -2024-04-05 06:31:05 - INFO - Method: Bleu_1, Score: 0.6786 -2024-04-05 06:31:05 - INFO - Method: Bleu_2, Score: 0.4952 -2024-04-05 06:31:05 - INFO - Method: Bleu_3, Score: 0.3530 -2024-04-05 06:31:05 - INFO - Method: Bleu_4, Score: 0.2466 -2024-04-05 06:31:05 - INFO - Method: CIDEr, Score: 0.7126 From 9892a0c69e4a80b04b1d8f17005d656130574900 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Fri, 5 Apr 2024 10:06:37 -0400 Subject: [PATCH 14/32] Add files via upload --- src/base/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/base/constants.py b/src/base/constants.py index a2c80c1..9c6c5bb 100644 --- a/src/base/constants.py +++ b/src/base/constants.py @@ -5,7 +5,7 @@ import spacy # set this path to where you want to save results -BASE_DIR = "/projectnb/ds598/projects/tgardos/sp2024_midterm/" +BASE_DIR = "/projectnb/ds598/students/lilinj/sp2024_midterm/" # Do not edit. 
This points to the dataset folder DATA_BASE_DIR = "/projectnb/ds598/materials/datasets/vizwiz/captions/" From 3f534550bcb21af69d19dbb89dcc2d2699310a1b Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Fri, 5 Apr 2024 10:08:38 -0400 Subject: [PATCH 15/32] Create 1 --- results/1 | 1 + 1 file changed, 1 insertion(+) create mode 100644 results/1 diff --git a/results/1 b/results/1 new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/results/1 @@ -0,0 +1 @@ + From 4d5b097dc2fcc467870222a84bbd3e4109a9e08b Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Fri, 5 Apr 2024 10:12:09 -0400 Subject: [PATCH 16/32] Update REPORT.md --- REPORT.md | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/REPORT.md b/REPORT.md index e94f416..2fe02ca 100644 --- a/REPORT.md +++ b/REPORT.md @@ -1,4 +1,29 @@ # DS598 DL4DS Midterm Project ## Introduction -The project aims to provide image-to-captioning services for blind people using AI technology. The project employs the [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), fine-tuned on the [VizWiz Image Captioning dataset](https://vizwiz.org/tasks-and-datasets/image-captioning/). The optimizer is AdamW with specific settings: a learning rate of 2e-5 and a weight decay of 5e-4. I trained it 15 epochs, and stopped it early at epoch 3, since it was overfitting afterwards. My best CIDEr-D score on the [test dataset](https://eval.ai/web/challenges/challenge-page/739/leaderboard/2006) is 75.37. +The project aims to provide image-to-captioning services for blind people using AI technology. The project employs the [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), fine-tuned on the [VizWiz Image Captioning dataset](https://vizwiz.org/tasks-and-datasets/image-captioning/). 
The optimizer is AdamW with specific settings: a learning rate of 2e-5 and a weight decay of 5e-4. I trained it 15 epochs, and stopped it early at epoch 6, since it was overfitting afterwards. My best CIDEr-D score on the [test dataset](https://eval.ai/web/challenges/challenge-page/739/leaderboard/2006) is 75.37. + +## Model performance +### + + +## Implementation Details and Challenges + +1. I browsed through the image-to-text models on the [huggingface website](https://huggingface.co/models?pipeline_tag=image-to-text&sort=trending) for basic information about these models, and fed dataset images into the reference API to evaluate the pre-trained models' outputs. Then, I selected the models like [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), [blip-image-captioning-large model](https://huggingface.co/Salesforce/blip-image-captioning-large), and [git-base](https://huggingface.co/microsoft/git-base) for fine-tuning. + +2. I experimented with various optimizers, including SGD, Adam, and AdamW. Since a high default learning rate would cause all inputs to yield few or identical outputs, I reduced and fine-tuned the learning rate to 2e-5. I also fine-tuned the weight decay to 5e-4. + +3. To prevent model overfitting, I adopted measures such as early stopping, batch size reduction, and L2 regularization. + +## Limitation and Reflection +1. Because of issues such as debugging empty outputs, CUDA version mismatches, limited computational resources, and long training times, my exploration of diverse models was constrained. + +2. I didn't try methods like data augmentation and dropout that could have potentially improved the model's robustness and generalization capabilities. + +## References +1. [CIDEr: Consensus-based image description evaluation](https://ieeexplore.ieee.org/document/7299087) +2. 
[BLEU: A Misunderstood Metric from Another Age](https://towardsdatascience.com/bleu-a-misunderstood-metric-from-another-age-d434e18f1b37), Medium Post +3. [BLEU Metric](https://huggingface.co/spaces/evaluate-metric/bleu), HuggingFace space +4. [AdamW](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) +5. [image_captioning](https://huggingface.co/docs/transformers/main/en/tasks/image_captioning) +6. [BlipForConditionalGeneration](https://huggingface.co/docs/transformers/en/model_doc/blip#transformers.BlipForConditionalGeneration) From 7e2219184f81cb28672b90af2adee9f9061cd6ab Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 17:49:15 -0400 Subject: [PATCH 17/32] Update demo_train.sh --- demo_train.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/demo_train.sh b/demo_train.sh index b497ff3..ac4fdcc 100644 --- a/demo_train.sh +++ b/demo_train.sh @@ -9,9 +9,11 @@ module load academic-ml/spring-2024 conda activate spring-2024-pyt # Change this path to point to your project directory -export PYTHONPATH="/projectnb/ds598/admin/tgardos/sp2024_midterm:$PYTHONPATH" +export PYTHONPATH="/projectnb/ds598/students/lilinj/sp2024_midterm:$PYTHONPATH" +#python -m spacy download en_core_web_sm # download spacy model python src/demo_model/train.py -### The command below is used to submit the job to the cluster +### The commands below are used to submit the job to the cluster ### qsub -pe omp 4 -P ds598 -l gpus=1 demo_train.sh +### qsub -l gpus=1 -l gpu_c=7.0 -pe omp 8 demo_train.sh From 6daaf0ff94bbcd97c38d6535b12d44940b0f5a35 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 17:50:08 -0400 Subject: [PATCH 18/32] Update demo_test.sh --- demo_test.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/demo_test.sh b/demo_test.sh index ec07167..52a6f21 100644 --- a/demo_test.sh +++ b/demo_test.sh @@ 
-9,9 +9,10 @@ module load academic-ml/spring-2024 conda activate spring-2024-pyt # Change this path to point to your project directory -export PYTHONPATH="/projectnb/ds598/admin/tgardos/sp2024_midterm:$PYTHONPATH" # Set this!!! +export PYTHONPATH="/projectnb/ds598/students/lilinj/sp2024_midterm:$PYTHONPATH" # Set this!!! python src/demo_model/test.py -### The command below is used to submit the job to the cluster -### qsub -pe omp 4 -P ds598 -l gpus=1 git_test.sh +### The commands below are used to submit the job to the cluster +### qsub -pe omp 4 -P ds598 -l gpus=1 demo_test.sh +### qsub -l gpus=1 -l gpu_c=7.0 -pe omp 8 demo_test.sh From 876dc1ac78fdf5e5211d9e645cf1dff8cc0030df Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 17:51:42 -0400 Subject: [PATCH 19/32] Update cnnlstm_train.sh --- cnnlstm_train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cnnlstm_train.sh b/cnnlstm_train.sh index 37d48e8..6c2cfa0 100644 --- a/cnnlstm_train.sh +++ b/cnnlstm_train.sh @@ -9,7 +9,7 @@ module load academic-ml/spring-2024 conda activate spring-2024-pyt # Change this path to point to your project directory -export PYTHONPATH="/projectnb/ds598/admin/tgardos/sp2024_midterm:$PYTHONPATH" # Set this!!! +PYTHONPATH="/projectnb/ds598/students/lilinj/sp2024_midterm:$PYTHONPATH" # Set this!!! 
python -m spacy download en_core_web_sm # download spacy model python src/cnn_lstm/train.py From b6e7e159ff971f799134330bf0d773d91ea85017 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 17:52:41 -0400 Subject: [PATCH 20/32] Update cnnlstm_test.sh --- cnnlstm_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cnnlstm_test.sh b/cnnlstm_test.sh index c329dda..6cc54c7 100644 --- a/cnnlstm_test.sh +++ b/cnnlstm_test.sh @@ -9,7 +9,7 @@ module load academic-ml/spring-2024 conda activate spring-2024-pyt # Change this path to point to your project directory -export PYTHONPATH="/projectnb/ds598/admin/tgardos/sp2024_midterm:$PYTHONPATH" +PYTHONPATH="/projectnb/ds598/students/lilinj/sp2024_midterm:$PYTHONPATH" #python -m spacy download en_core_web_sm # download spacy model python src/cnn_lstm/test.py From 1d0d7d17f8488f2af11c7ce4b5c5efdd03b390e5 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 17:53:00 -0400 Subject: [PATCH 21/32] Update cnnlstm_train.sh --- cnnlstm_train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cnnlstm_train.sh b/cnnlstm_train.sh index 6c2cfa0..43d500f 100644 --- a/cnnlstm_train.sh +++ b/cnnlstm_train.sh @@ -11,7 +11,7 @@ conda activate spring-2024-pyt # Change this path to point to your project directory PYTHONPATH="/projectnb/ds598/students/lilinj/sp2024_midterm:$PYTHONPATH" # Set this!!! 
-python -m spacy download en_core_web_sm # download spacy model +#python -m spacy download en_core_web_sm # download spacy model python src/cnn_lstm/train.py ### The command below is used to submit the job to the cluster From 5b2c62a1db66baa5a6fd88fdbefa31df4e210567 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 21:02:19 -0400 Subject: [PATCH 22/32] Update README.md --- README.md | 95 +++++++------------------------------------------------ 1 file changed, 12 insertions(+), 83 deletions(-) diff --git a/README.md b/README.md index 9062b8d..4502f24 100644 --- a/README.md +++ b/README.md @@ -1,95 +1,18 @@ # DS598 DL4DS Midterm Project ## Introduction -For this project, you will train a network to generate captions for the -[VizWiz Image Captioning dataset](https://vizwiz.org/tasks-and-datasets/image-captioning/). -The images are taken by people who are blind and typically rely on -human-based image captioning services. Your objective will be to beat a -a baseline score on the [test set leaderboard](https://eval.ai/web/challenges/challenge-page/739/leaderboard/2006). - -## Developer Setup - -Clone this repo to your directory on the SCC DS598 project space, e.g. -`/projectnb/ds598/students/`. - -Once you have a training script setup, create a shell script, e.g. `train.sh`, -that loads and activates a conda environment and then runs your training -script. An example shell script is below. - -```sh -#!/bin/bash -l - -# Set SCC project -#$ -P ds598 - -# load and activate the academic-ml conda environment on SCC -module load miniconda -module load academic-ml/spring-2024 -conda activate spring-2024-pyt - -# Add the path to your source project directory to the python search path -# so that the local `import` commands will work. 
-export PYTHONPATH="/projectnb/ds598/students//:$PYTHONPATH" - -# Update this path to point to your training file -python path/to/train.py - -# After updating the two paths above, run the command below from an SCC -# command prompt in the same directory as this file to submit this as a -# batch job. -### qsub -pe omp 4 -P ds598 -l gpus=1 train.sh -``` - -Note that there are train and test scripts for the two folders already. - -## Run Example Scripts - -When you run the example scripts, make sure to add the path to the repo -folder before running the script. - -```export PYTHONPATH="/projectnb/ds598/path/to/folder:$PYTHONPATH"``` - -The example shell scripts include this command. - - -Set the paths in `src/base/constants.py` to the correct paths on your system. - -Follow the .sh files to run the code. As an example, to run the `cnnlstm_train.sh` -script, you would run at the command prompt from the base of your local repo -folder: - -```sh -$ qsub -pe omp 4 -P ds598 -l gpus=1 cnnlstm_train.sh -Your job 5437870 ("cnnlstm_train.sh") has been submitted -``` -As shown, you should get notification that your job was submitted and get a -job ID number. - -You can check your job status by typing: - -```sh -$ qstat -u -ob-ID prior name user state submit/start at queue slots ja-task-ID ------------------------------------------------------------------------------------------------------------------ -5437870 0.00000 cnnlstm_tr tgardos qw 03/14/2024 09:40:24 -``` - -The above is showing the example output from user `tgardos`. +The project aims to provide image-to-caption services for blind people using Transformer technology. The project employs the [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), fine-tuned on the [VizWiz Image Captioning dataset](https://vizwiz.org/tasks-and-datasets/image-captioning/). 
The optimizer is [AdamW](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) with a learning rate of 2e-5 and a weight decay of 5e-4. The model is set to train for up to 16 epochs, but training is stopped early at epoch 7, since it is overfitting afterwards. The batch sizes of training and validation are 6 and 32 respectively. The model achieved a CIDEr-D score of 75.37 on the [test dataset](https://eval.ai/web/challenges/challenge-page/739/leaderboard/2006). ## Dataset +The dataset used in this project is the VizWiz-Captions dataset, which includes 39,181 images sourced from individuals who are blind. Each image is accompanied by 5 descriptive captions. -The dataset is downloaded to -`/projectnb/ds598/materials/datasets/vizwiz/captions`. There is no need to -download the dataset again and the path has already been defined in the -accompanying code. +Download the dataset from the website [VizWiz Image Captioning dataset](https://vizwiz.org/tasks-and-datasets/image-captioning/) and update the paths of annotation_file and image_folder in `src/base/dataset.py`. ## Evaluation -In the VizWiz challenge evaluation they refer to five different evaluation -metrics although they use CIDr-D as their primary evaluation. +In the VizWiz challenge evaluation they refer to five different evaluation metrics although they use CIDEr-D as their primary evaluation. -They reference the BLUE metric, but there are limitations to that metric as -described in [2] below. +They reference the BLEU metric, but there are limitations to that metric as described in [2] below. ### Validation Results @@ -109,11 +32,17 @@ Step-by-step instructions will be added here shortly. State-of-the-art CIDEr-D scores on VizWiz Image Captioning is ~125. We're asking that you get a **minimum CIDEr-D test score of 50**. -## References +## Limitation and Reflection +1.
I faced challenges such as debugging empty predictions, CUDA version mismatches, limited computational resources, and long training times. Consequently, my experimentation was limited to a few models such as [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), [blip-image-captioning-large model](https://huggingface.co/Salesforce/blip-image-captioning-large), and [git-base](https://huggingface.co/microsoft/git-base) for fine-tuning. + +2. I didn't try methods like data augmentation and dropout that could have potentially improved the model's robustness and generalization capabilities. +## References 1. [CIDEr: Consensus-based image description evaluation](https://ieeexplore.ieee.org/document/7299087) 2. [BLEU: A Misunderstood Metric from Another Age](https://towardsdatascience.com/bleu-a-misunderstood-metric-from-another-age-d434e18f1b37), Medium Post 3. [BLEU Metric](https://huggingface.co/spaces/evaluate-metric/bleu), HuggingFace space +4. [image_captioning](https://huggingface.co/docs/transformers/main/en/tasks/image_captioning) +5. [BlipForConditionalGeneration](https://huggingface.co/docs/transformers/en/model_doc/blip#transformers.BlipForConditionalGeneration) From a3979b8fb6594c9fb1b32977f2764565d9b4ee21 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 21:10:25 -0400 Subject: [PATCH 23/32] Update README.md --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4502f24..bd0a1c8 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ Step-by-step instructions will be added here shortly. State-of-the-art CIDEr-D scores on VizWiz Image Captioning is ~125. We're asking that you get a **minimum CIDEr-D test score of 50**. ## Limitation and Reflection -1. I faced challenges such as debugging empty predictions, CUDA version mismatches, limited computational resources, and long training times. 
Consequently, my experimentation was limited to a few models such as [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), [blip-image-captioning-large model](https://huggingface.co/Salesforce/blip-image-captioning-large), and [git-base](https://huggingface.co/microsoft/git-base) for fine-tuning. +1. Facing with challenges such as debugging empty predictions, CUDA version mismatches, limited computational resources, and long training times. Consequently, my experimentation was limited to a few models such as [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), [blip-image-captioning-large model](https://huggingface.co/Salesforce/blip-image-captioning-large), and [git-base](https://huggingface.co/microsoft/git-base) for fine-tuning. 2. I didn't try methods like data augmentation and dropout that could have potentially improved the model's robustness and generalization capabilities. @@ -41,8 +41,9 @@ State-of-the-art CIDEr-D scores on VizWiz Image Captioning is ~125. We're asking 1. [CIDEr: Consensus-based image description evaluation](https://ieeexplore.ieee.org/document/7299087) 2. [BLEU: A Misunderstood Metric from Another Age](https://towardsdatascience.com/bleu-a-misunderstood-metric-from-another-age-d434e18f1b37), Medium Post 3. [BLEU Metric](https://huggingface.co/spaces/evaluate-metric/bleu), HuggingFace space -4. [image_captioning](https://huggingface.co/docs/transformers/main/en/tasks/image_captioning) -5. [BlipForConditionalGeneration](https://huggingface.co/docs/transformers/en/model_doc/blip#transformers.BlipForConditionalGeneration) +4. +5. [image_captioning](https://huggingface.co/docs/transformers/main/en/tasks/image_captioning) +6. 
[BlipForConditionalGeneration](https://huggingface.co/docs/transformers/en/model_doc/blip#transformers.BlipForConditionalGeneration) From abd82021b8fda2024035c512ef4bc0f93f804ee0 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 21:11:46 -0400 Subject: [PATCH 24/32] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bd0a1c8..3c9913e 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ Step-by-step instructions will be added here shortly. State-of-the-art CIDEr-D scores on VizWiz Image Captioning is ~125. We're asking that you get a **minimum CIDEr-D test score of 50**. ## Limitation and Reflection -1. Facing with challenges such as debugging empty predictions, CUDA version mismatches, limited computational resources, and long training times. Consequently, my experimentation was limited to a few models such as [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), [blip-image-captioning-large model](https://huggingface.co/Salesforce/blip-image-captioning-large), and [git-base](https://huggingface.co/microsoft/git-base) for fine-tuning. +1. Facing with challenges such as debugging empty predictions, CUDA version mismatches, limited computational resources, and long training times, my experimentation was limited to a few models such as [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), [blip-image-captioning-large model](https://huggingface.co/Salesforce/blip-image-captioning-large), and [git-base](https://huggingface.co/microsoft/git-base) for fine-tuning. 2. I didn't try methods like data augmentation and dropout that could have potentially improved the model's robustness and generalization capabilities. @@ -41,7 +41,7 @@ State-of-the-art CIDEr-D scores on VizWiz Image Captioning is ~125. We're asking 1. 
[CIDEr: Consensus-based image description evaluation](https://ieeexplore.ieee.org/document/7299087) 2. [BLEU: A Misunderstood Metric from Another Age](https://towardsdatascience.com/bleu-a-misunderstood-metric-from-another-age-d434e18f1b37), Medium Post 3. [BLEU Metric](https://huggingface.co/spaces/evaluate-metric/bleu), HuggingFace space -4. +4. [image-to-text models](https://huggingface.co/models?pipeline_tag=image-to-text&sort=trending) 5. [image_captioning](https://huggingface.co/docs/transformers/main/en/tasks/image_captioning) 6. [BlipForConditionalGeneration](https://huggingface.co/docs/transformers/en/model_doc/blip#transformers.BlipForConditionalGeneration) From 1ca18b6a45656332d7eaf9e62883d983f28c718a Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 21:14:17 -0400 Subject: [PATCH 25/32] Delete REPORT.md --- REPORT.md | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 REPORT.md diff --git a/REPORT.md b/REPORT.md deleted file mode 100644 index 2fe02ca..0000000 --- a/REPORT.md +++ /dev/null @@ -1,29 +0,0 @@ -# DS598 DL4DS Midterm Project - -## Introduction -The project aims to provide image-to-captioning services for blind people using AI technology. The project employs the [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), fine-tuned on the [VizWiz Image Captioning dataset](https://vizwiz.org/tasks-and-datasets/image-captioning/). The optimizer is AdamW with specific settings: a learning rate of 2e-5 and a weight decay of 5e-4. I trained it 15 epochs, and stopped it early at epoch 6, since it was overfitting afterwards. My best CIDEr-D score on the [test dataset](https://eval.ai/web/challenges/challenge-page/739/leaderboard/2006) is 75.37. - -## Model performance -### - - -## Implementation Details and Challanges - -1. 
I browsed through the image-to-text models on the [huggingface website](https://huggingface.co/models?pipeline_tag=image-to-text&sort=trending) for basic information about these models, and fed dataset images into the reference API to evaluate the pre-trained models' outputs. Then, I selected the models like [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), [blip-image-captioning-large model](https://huggingface.co/Salesforce/blip-image-captioning-large), and [git-base](https://huggingface.co/microsoft/git-base) for fine-tuning. - -2. I experimented with various optimizers, including SGD, Adam, and AdamW. Since a high default learning rate would cause all inputs to yield few or identical outputs, I reduced and fine-tuned the learning rate to 2e-5. I also fine-tuned the weight decay to 5e-4. - -3. To prevent model overfitting, I adopted measures such as early stopping, batch size reduction, and L2 regularization. - -## Limitation and Reflection -1. Facing with issues like debugging empty outputs, CUDA version mismatches, limited computational resources, and long training times, my exploration of diverse models was constrained. - -2. I didn't try methods like data augmentation and dropout that could have potentially improved the model's robustness and generalization capabilities. - -## References -1. [CIDEr: Consensus-based image description evaluation](https://ieeexplore.ieee.org/document/7299087) -2. [BLEU: A Misunderstood Metric from Another Age](https://towardsdatascience.com/bleu-a-misunderstood-metric-from-another-age-d434e18f1b37), Medium Post -3. [BLEU Metric](https://huggingface.co/spaces/evaluate-metric/bleu), HuggingFace space -4. [AdamW](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) -5. [image_captioning](https://huggingface.co/docs/transformers/main/en/tasks/image_captioning) -6. 
[BlipForConditionalGeneration](https://huggingface.co/docs/transformers/en/model_doc/blip#transformers.BlipForConditionalGeneration) From 5012d0100e5f41d2c98142796f7407b7ed7ffa9c Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 21:44:37 -0400 Subject: [PATCH 26/32] Update README.md --- README.md | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 3c9913e..50d3083 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,11 @@ # DS598 DL4DS Midterm Project ## Introduction + The project aims to provide image-to-caption services for blind people using Transformer technology. The project employs the [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), fine-tuned on the [VizWiz Image Captioning dataset](https://vizwiz.org/tasks-and-datasets/image-captioning/). The optimizer is [AdamW](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) with a learning rate of 2e-5 and a weight decay of 5e-4. The model is set to train for up to 16 epochs, but training is stopped early at epoch 7, since it is overfitting afterwards. The batch sizes of training and validation are 6 and 32 respectively. The model achieved a CIDEr-D score of 75.37 on the [test dataset](https://eval.ai/web/challenges/challenge-page/739/leaderboard/2006). ## Dataset + The dataset used in this project is the VizWiz-Captions dataset, which includes 39,181 images sourced from individuals who are blind. Each image is accompanied by 5 descriptive captions. Download the dataset from the website [VizWiz Image Captioning dataset](https://vizwiz.org/tasks-and-datasets/image-captioning/) and update the paths of annotation_file and image_folder in `src/base/dataset.py`. 
@@ -16,21 +18,30 @@ They reference the BLUE metric, but there are limitations to that metric as desc ### Validation Results -Validation set results are reported in the CNN-LSTM example and code for reporting validation results are in the demo model code. - -### Test Results - -As is typically the case, the test dataset labels are withheld, and so the only way to get test results is to produce predicted captions and -then submit them to the VizWiz Image Captioning [Evaluation Server](https://eval.ai/web/challenges/challenge-page/739/overview). There are -scripts in both model directories to create the test submission file, although the demo model test script will have to be updated with model -information. +At Epoch 7, the training loss was 1.3944. The performance scores for this epoch are as follows: -Create an account on the [Evaluation Server](https://eval.ai/web/challenges/challenge-page/739/overview) and submit your test predictions -to get your result. +| Metric | Score | +|---------|---------| +| BLEU-1 | 0.6757 | +| BLEU-2 | 0.4938 | +| BLEU-3 | 0.3489 | +| BLEU-4 | 0.2419 | +| CIDEr | 0.7261 | -Step-by-step instructions will be added here shortly. +### Test Results -State-of-the-art CIDEr-D scores on VizWiz Image Captioning is ~125. We're asking that you get a **minimum CIDEr-D test score of 50**. +I submitted my test results to the VizWiz Image Captioning [Evaluation Server](https://eval.ai/web/challenges/challenge-page/739/overview). Here are the performance scores obtained: + +| Metric | Score | +|---------|-------| +| BLEU-1 | 68.49 | +| BLEU-2 | 50.20 | +| BLEU-3 | 35.68 | +| BLEU-4 | 24.89 | +| ROUGE-L | 48.51 | +| METEOR | 22.06 | +| **CIDEr** | **75.37** | +| SPICE | 17.48 | ## Limitation and Reflection 1. 
Facing with challenges such as debugging empty predictions, CUDA version mismatches, limited computational resources, and long training times, my experimentation was limited to a few models such as [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), [blip-image-captioning-large model](https://huggingface.co/Salesforce/blip-image-captioning-large), and [git-base](https://huggingface.co/microsoft/git-base) for fine-tuning. From e38897a5eb768bf7f123044ac3ac44ae598b6e09 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 21:48:36 -0400 Subject: [PATCH 27/32] Delete results/1 --- results/1 | 1 - 1 file changed, 1 deletion(-) delete mode 100644 results/1 diff --git a/results/1 b/results/1 deleted file mode 100644 index 8b13789..0000000 --- a/results/1 +++ /dev/null @@ -1 +0,0 @@ - From 52696c21808f9fdfdd38b196942f340de0af8ee4 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 22:25:45 -0400 Subject: [PATCH 28/32] Update README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 50d3083..2bb25e5 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,12 @@ I submitted my test results to the VizWiz Image Captioning [Evaluation Server](h | **CIDEr** | **75.37** | | SPICE | 17.48 | +## Implementation Suggestions + +1. Explore trending image-to-text models on the [huggingface repository](https://huggingface.co/models?pipeline_tag=image-to-text&sort=trending) for alternatives, and feed dataset images into the reference API to evaluate the pre-trained models' outputs. + +2. The default learning rates for optimizers such as SGD, Adam, and AdamW are too high for fine-tuning, potentially leading to similar outputs across different inputs. It is recommended to adjust the learning rate to between 1e-5 and 5e-4. + ## Limitation and Reflection 1. 
Facing with challenges such as debugging empty predictions, CUDA version mismatches, limited computational resources, and long training times, my experimentation was limited to a few models such as [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), [blip-image-captioning-large model](https://huggingface.co/Salesforce/blip-image-captioning-large), and [git-base](https://huggingface.co/microsoft/git-base) for fine-tuning. From 3565a89c7ac4ebbc12536bfd08536f5f3e5c5b37 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 22:26:34 -0400 Subject: [PATCH 29/32] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2bb25e5..e606f32 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ I submitted my test results to the VizWiz Image Captioning [Evaluation Server](h 1. Explore trending image-to-text models on the [huggingface repository](https://huggingface.co/models?pipeline_tag=image-to-text&sort=trending) for alternatives, and feed dataset images into the reference API to evaluate the pre-trained models' outputs. -2. The default learning rates for optimizers such as SGD, Adam, and AdamW are too high for fine-tuning, potentially leading to similar outputs across different inputs. It is recommended to adjust the learning rate to between 1e-5 and 5e-4. +2. The default learning rates for optimizers such as SGD, Adam, and AdamW are too high for fine-tuning, potentially leading to similar outputs across different inputs. It is recommended to adjust the learning rate to between 1e-5 and 5e-5. ## Limitation and Reflection 1. 
Facing with challenges such as debugging empty predictions, CUDA version mismatches, limited computational resources, and long training times, my experimentation was limited to a few models such as [blip-image-captioning-base model](https://huggingface.co/Salesforce/blip-image-captioning-base), [blip-image-captioning-large model](https://huggingface.co/Salesforce/blip-image-captioning-large), and [git-base](https://huggingface.co/microsoft/git-base) for fine-tuning. From 4b2c8674664c00a3430f45138805367b7c0df327 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 22:27:12 -0400 Subject: [PATCH 30/32] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e606f32..6b2098c 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ At Epoch 7, the training loss was 1.3944. The performance scores for this epoch | BLEU-2 | 0.4938 | | BLEU-3 | 0.3489 | | BLEU-4 | 0.2419 | -| CIDEr | 0.7261 | +| **CIDEr** | **0.7261** | ### Test Results From 2a7ef4f97faa04350eb9905c18b94de69a6d6b9f Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 23:16:55 -0400 Subject: [PATCH 31/32] Update README.md --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 6b2098c..063c691 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,16 @@ At Epoch 7, the training loss was 1.3944. The performance scores for this epoch | BLEU-4 | 0.2419 | | **CIDEr** | **0.7261** | +Here are two examples of the model's predictions: + +Good example: + +![good example](https://i.postimg.cc/rzyv09gc/good-example.png) + +Bad example: + +![bad example](https://i.postimg.cc/SYK5Byjj/bad-example.png) + ### Test Results I submitted my test results to the VizWiz Image Captioning [Evaluation Server](https://eval.ai/web/challenges/challenge-page/739/overview). 
Here are the performance scores obtained: From 464407805a66282388b1efc1c8c5e9857a8c8146 Mon Sep 17 00:00:00 2001 From: lilinSTART <144761144+lilinSTART@users.noreply.github.com> Date: Sat, 6 Apr 2024 23:20:12 -0400 Subject: [PATCH 32/32] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 063c691..4a48e38 100644 --- a/README.md +++ b/README.md @@ -32,11 +32,11 @@ Here are two examples of the model's predictions: Good example: -![good example](https://i.postimg.cc/rzyv09gc/good-example.png) +![good example](https://i.postimg.cc/HWbHNZyJ/good-example.png) Bad example: -![bad example](https://i.postimg.cc/SYK5Byjj/bad-example.png) +![bad example](https://i.postimg.cc/qqcTCqTc/bad-example.png) ### Test Results