diff --git a/app/sidekiq/retrain_zoobot_job.rb b/app/sidekiq/retrain_zoobot_job.rb index 994c06a..e5b53ac 100644 --- a/app/sidekiq/retrain_zoobot_job.rb +++ b/app/sidekiq/retrain_zoobot_job.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true require 'bajor/client' +require 'honeybadger/ruby' class RetrainZoobotJob class Failure < StandardError; end @@ -10,46 +11,51 @@ class Failure < StandardError; end TRAINING_JOB_MONITOR = ENV.fetch('TRAINING_JOB_MONITOR', 10).to_i def perform(context_id) - training_context = Context.find(context_id) - - # see if we have a recent re-usable data export instead of making one each time - # the data should be similar and the window period is configurable - training_data_export = find_recent_training_data_export(training_context.workflow_id) - - # if we haven't found a recent training data export then create one - unless training_data_export - training_data_export = TrainingDataExport.create!( - storage_path: TrainingDataExport.storage_path(training_context.workflow_id), - workflow_id: training_context.workflow_id - ) - - # run the export service code to create the training data export catalogue on blob storage system - Export::TrainingData.new(training_data_export).run + begin + training_context = Context.find(context_id) + + # see if we have a recent re-usable data export instead of making one each time + # the data should be similar and the window period is configurable + training_data_export = find_recent_training_data_export(training_context.workflow_id) + + # if we haven't found a recent training data export then create one + unless training_data_export + training_data_export = TrainingDataExport.create!( + storage_path: TrainingDataExport.storage_path(training_context.workflow_id), + workflow_id: training_context.workflow_id + ) + + # run the export service code to create the training data export catalogue on blob storage system + Export::TrainingData.new(training_data_export).run + end + + # this is where we could intercept the training job submission + # to avoid a training run if there isn't enough data for a viable model + # one idea would be to check the number of rows in the training data export attached file + # or even better we store the number of exported rows in the training data export model + # https://github.com/zooniverse/kade/issues/62 + + # create a new training job record to track the batch training job + training_job = create_training_job(training_data_export.storage_path, training_context.workflow_id) + # submit the export training job to the batch training service + # this updates the training job state + training_job = Batch::Training::CreateJob.new(training_job).run + + # raise a failure here to rely on sidekiq to retry the job + # and notify us that there are issues with job submission + # Note: if this gets noisy we can look at silencing the error reporting + raise Failure, "failure when submiting the training job with id: #{training_job.id}" if training_job.failed? + + # kick off a job monitor here that updates the training job resource with the job tasks results + # this background job will reschedule itself until the training job is completed + # and handle the completion / failure events for the training job + TrainingJobMonitorJob.perform_in(TRAINING_JOB_MONITOR.minutes, training_job.id, training_context.id) + + training_job + rescue StandardError => e + Honeybadger.notify(e) + raise end - - # this is where we could intercept the training job submission - # to avoid a training run if there isn't enough data for a viable model - # one idea would be to check the number of rows in the training data export attached file - # or even better we store the number of exported rows in the training data export model - # https://github.com/zooniverse/kade/issues/62 - - # create a new training job record to track the batch training job - training_job = create_training_job(training_data_export.storage_path, training_context.workflow_id) - # submit the export training job to the batch training service - # this updates the training job state - training_job = Batch::Training::CreateJob.new(training_job).run - - # raise a failure here to rely on sidekiq to retry the job - # and notify us that there are issues with job submission - # Note: if this gets noisy we can look at silencing the error reporting - raise Failure, "failure when submiting the training job with id: #{training_job.id}" if training_job.failed? - - # kick off a job monitor here that updates the training job resource with the job tasks results - # this background job will reschedule itself until the training job is completed - # and handle the completion / failure events for the training job - TrainingJobMonitorJob.perform_in(TRAINING_JOB_MONITOR.minutes, training_job.id, training_context.id) - - training_job end def find_recent_training_data_export(workflow_id) diff --git a/spec/services/batch/training/create_job_spec.rb b/spec/services/batch/training/create_job_spec.rb index 8fc2795..e280380 100644 --- a/spec/services/batch/training/create_job_spec.rb +++ b/spec/services/batch/training/create_job_spec.rb @@ -44,6 +44,12 @@ allow(bajor_client_double).to receive(:create_training_job).and_raise(Bajor::Client::Error, error_message) end + it 'does not notify Honeybadger about the failure' do + allow(Honeybadger).to receive(:notify) + training_create_job.run + expect(Honeybadger).not_to have_received(:notify) + end + it 'stores the error message on the training job resource' do expect { training_create_job.run diff --git a/spec/sidekiq/retrain_zoobot_job_spec.rb b/spec/sidekiq/retrain_zoobot_job_spec.rb index 25d211e..7e95358 100644 --- a/spec/sidekiq/retrain_zoobot_job_spec.rb +++ b/spec/sidekiq/retrain_zoobot_job_spec.rb @@ -17,6 +17,7 @@ allow(Export::TrainingData).to receive(:new).and_return(export_training_data_double) allow(batch_training_create_job_double).to receive(:run).and_return(training_job) allow(TrainingJob).to receive(:create!).and_call_original + allow(Honeybadger).to receive(:notify) allow(Batch::Training::CreateJob).to receive(:new).with(instance_of(TrainingJob)).and_return(batch_training_create_job_double) end @@ -55,6 +56,25 @@ expect { job.perform(context.id) }.to raise_error(RetrainZoobotJob::Failure) expect(TrainingJobMonitorJob).not_to have_received(:perform_in) end + + it 'notifies Honeybadger about the failed training job' do + expect { job.perform(context.id) }.to raise_error(RetrainZoobotJob::Failure) + expect(Honeybadger).to have_received(:notify).with(instance_of(RetrainZoobotJob::Failure)) + end + end + + context 'when training data export fails' do + let(:export_error) { Format::TrainingDataCsv::MissingLocationData.new('For subject id: 1') } + + before do + allow(export_training_data_double).to receive(:run).and_raise(export_error) + end + + it 'notifies Honeybadger before reraising' do + expect { job.perform(context.id) }.to raise_error(Format::TrainingDataCsv::MissingLocationData) + + expect(Honeybadger).to have_received(:notify).with(export_error) + end end context 'with existing training data exports' do