zooniverse · Tooyosi · May 13, 2026 · Apr 30, 2026
diff --git a/app/sidekiq/retrain_zoobot_job.rb b/app/sidekiq/retrain_zoobot_job.rb
@@ -1,5 +1,6 @@
 # frozen_string_literal: true
 require 'bajor/client'
+require 'honeybadger/ruby'
 
 class RetrainZoobotJob
   class Failure < StandardError; end
@@ -10,46 +11,51 @@ class Failure < StandardError; end
   TRAINING_JOB_MONITOR = ENV.fetch('TRAINING_JOB_MONITOR', 10).to_i
 
   def perform(context_id)
-    training_context = Context.find(context_id)
-
-    # see if we have a recent re-usable data export instead of making one each time
-    # the data should be similar and the window period is configurable
-    training_data_export = find_recent_training_data_export(training_context.workflow_id)
-
-    # if we haven't found a recent training data export then create one
-    unless training_data_export
-      training_data_export = TrainingDataExport.create!(
-        storage_path: TrainingDataExport.storage_path(training_context.workflow_id),
-        workflow_id: training_context.workflow_id
-      )
-
-      # run the export service code to create the training data export catalogue on blob storage system
-      Export::TrainingData.new(training_data_export).run
+    begin
+      training_context = Context.find(context_id)
+
+      # see if we have a recent re-usable data export instead of making one each time
+      # the data should be similar and the window period is configurable
+      training_data_export = find_recent_training_data_export(training_context.workflow_id)
+
+      # if we haven't found a recent training data export then create one
+      unless training_data_export
+        training_data_export = TrainingDataExport.create!(
+          storage_path: TrainingDataExport.storage_path(training_context.workflow_id),
+          workflow_id: training_context.workflow_id
+        )
+
+        # run the export service code to create the training data export catalogue on blob storage system
+        Export::TrainingData.new(training_data_export).run
+      end
+
+      # this is where we could intercept the training job submission
+      # to avoid a training run if there isn't enough data for a viable model
+      # one idea would be to check the number of rows in the training data export attached file
+      # or even better we store the number of exported rows in the training data export model
+      # https://github.com/zooniverse/kade/issues/62
+
+      # create a new training job record to track the batch training job
+      training_job = create_training_job(training_data_export.storage_path, training_context.workflow_id)
+      # submit the export training job to the batch training service
+      # this updates the training job state
+      training_job = Batch::Training::CreateJob.new(training_job).run
+
+      # raise a failure here to rely on sidekiq to retry the job
+      # and notify us that there are issues with job submission
+      # Note: if this gets noisy we can look at silencing the error reporting
+      raise Failure, "failure when submiting the training job with id: #{training_job.id}" if training_job.failed?
+
+      # kick off a job monitor here that updates the training job resource with the job tasks results
+      # this background job will reschedule itself until the training job is completed
+      # and handle the completion / failure events for the training job
+      TrainingJobMonitorJob.perform_in(TRAINING_JOB_MONITOR.minutes, training_job.id, training_context.id)
+
+      training_job
+    rescue StandardError => e
+      Honeybadger.notify(e)
+      raise
     end
-
-    # this is where we could intercept the training job submission
-    # to avoid a training run if there isn't enough data for a viable model
-    # one idea would be to check the number of rows in the training data export attached file
-    # or even better we store the number of exported rows in the training data export model
-    # https://github.com/zooniverse/kade/issues/62
-
-    # create a new training job record to track the batch training job
-    training_job = create_training_job(training_data_export.storage_path, training_context.workflow_id)
-    # submit the export training job to the batch training service
-    # this updates the training job state
-    training_job = Batch::Training::CreateJob.new(training_job).run
-
-    # raise a failure here to rely on sidekiq to retry the job
-    # and notify us that there are issues with job submission
-    # Note: if this gets noisy we can look at silencing the error reporting
-    raise Failure, "failure when submiting the training job with id: #{training_job.id}" if training_job.failed?
-
-    # kick off a job monitor here that updates the training job resource with the job tasks results
-    # this background job will reschedule itself until the training job is completed
-    # and handle the completion / failure events for the training job
-    TrainingJobMonitorJob.perform_in(TRAINING_JOB_MONITOR.minutes, training_job.id, training_context.id)
-
-    training_job
   end
 
   def find_recent_training_data_export(workflow_id)

diff --git a/spec/services/batch/training/create_job_spec.rb b/spec/services/batch/training/create_job_spec.rb
@@ -44,6 +44,12 @@
         allow(bajor_client_double).to receive(:create_training_job).and_raise(Bajor::Client::Error, error_message)
       end
 
+      it 'does not notify Honeybadger about the failure' do
+        allow(Honeybadger).to receive(:notify)
+        training_create_job.run
+        expect(Honeybadger).not_to have_received(:notify)
+      end
+
       it 'stores the error message on the training job resource' do
         expect {
           training_create_job.run

diff --git a/spec/sidekiq/retrain_zoobot_job_spec.rb b/spec/sidekiq/retrain_zoobot_job_spec.rb
@@ -17,6 +17,7 @@
       allow(Export::TrainingData).to receive(:new).and_return(export_training_data_double)
       allow(batch_training_create_job_double).to receive(:run).and_return(training_job)
       allow(TrainingJob).to receive(:create!).and_call_original
+      allow(Honeybadger).to receive(:notify)
       allow(Batch::Training::CreateJob).to receive(:new).with(instance_of(TrainingJob)).and_return(batch_training_create_job_double)
     end
 
@@ -55,6 +56,25 @@
         expect { job.perform(context.id) }.to raise_error(RetrainZoobotJob::Failure)
         expect(TrainingJobMonitorJob).not_to have_received(:perform_in)
       end
+
+      it 'notifies Honeybadger about the failed training job' do
+        expect { job.perform(context.id) }.to raise_error(RetrainZoobotJob::Failure)
+        expect(Honeybadger).to have_received(:notify).with(instance_of(RetrainZoobotJob::Failure))
+      end
+    end
+
+    context 'when training data export fails' do
+      let(:export_error) { Format::TrainingDataCsv::MissingLocationData.new('For subject id: 1') }
+
+      before do
+        allow(export_training_data_double).to receive(:run).and_raise(export_error)
+      end
+
+      it 'notifies Honeybadger before reraising' do
+        expect { job.perform(context.id) }.to raise_error(Format::TrainingDataCsv::MissingLocationData)
+
+        expect(Honeybadger).to have_received(:notify).with(export_error)
+      end
     end
 
     context 'with existing training data exports' do