Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 45 additions & 39 deletions app/sidekiq/retrain_zoobot_job.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true
require 'bajor/client'
require 'honeybadger/ruby'

class RetrainZoobotJob
class Failure < StandardError; end
Expand All @@ -10,46 +11,51 @@ class Failure < StandardError; end
TRAINING_JOB_MONITOR = ENV.fetch('TRAINING_JOB_MONITOR', 10).to_i

def perform(context_id)
training_context = Context.find(context_id)

# see if we have a recent re-usable data export instead of making one each time
# the data should be similar and the window period is configurable
training_data_export = find_recent_training_data_export(training_context.workflow_id)

# if we haven't found a recent training data export then create one
unless training_data_export
training_data_export = TrainingDataExport.create!(
storage_path: TrainingDataExport.storage_path(training_context.workflow_id),
workflow_id: training_context.workflow_id
)

# run the export service code to create the training data export catalogue on blob storage system
Export::TrainingData.new(training_data_export).run
begin
training_context = Context.find(context_id)

# see if we have a recent re-usable data export instead of making one each time
# the data should be similar and the window period is configurable
training_data_export = find_recent_training_data_export(training_context.workflow_id)

# if we haven't found a recent training data export then create one
unless training_data_export
training_data_export = TrainingDataExport.create!(
storage_path: TrainingDataExport.storage_path(training_context.workflow_id),
workflow_id: training_context.workflow_id
)

# run the export service code to create the training data export catalogue on blob storage system
Export::TrainingData.new(training_data_export).run
end

# this is where we could intercept the training job submission
# to avoid a training run if there isn't enough data for a viable model
# one idea would be to check the number of rows in the training data export attached file
# or even better we store the number of exported rows in the training data export model
# https://github.com/zooniverse/kade/issues/62

# create a new training job record to track the batch training job
training_job = create_training_job(training_data_export.storage_path, training_context.workflow_id)
# submit the export training job to the batch training service
# this updates the training job state
training_job = Batch::Training::CreateJob.new(training_job).run

# raise a failure here to rely on sidekiq to retry the job
# and notify us that there are issues with job submission
# Note: if this gets noisy we can look at silencing the error reporting
raise Failure, "failure when submiting the training job with id: #{training_job.id}" if training_job.failed?

# kick off a job monitor here that updates the training job resource with the job tasks results
# this background job will reschedule itself until the training job is completed
# and handle the completion / failure events for the training job
TrainingJobMonitorJob.perform_in(TRAINING_JOB_MONITOR.minutes, training_job.id, training_context.id)

training_job
rescue StandardError => e
Honeybadger.notify(e)
raise
end

# this is where we could intercept the training job submission
# to avoid a training run if there isn't enough data for a viable model
# one idea would be to check the number of rows in the training data export attached file
# or even better we store the number of exported rows in the training data export model
# https://github.com/zooniverse/kade/issues/62

# create a new training job record to track the batch training job
training_job = create_training_job(training_data_export.storage_path, training_context.workflow_id)
# submit the export training job to the batch training service
# this updates the training job state
training_job = Batch::Training::CreateJob.new(training_job).run

# raise a failure here to rely on sidekiq to retry the job
# and notify us that there are issues with job submission
# Note: if this gets noisy we can look at silencing the error reporting
raise Failure, "failure when submiting the training job with id: #{training_job.id}" if training_job.failed?

# kick off a job monitor here that updates the training job resource with the job tasks results
# this background job will reschedule itself until the training job is completed
# and handle the completion / failure events for the training job
TrainingJobMonitorJob.perform_in(TRAINING_JOB_MONITOR.minutes, training_job.id, training_context.id)

training_job
end

def find_recent_training_data_export(workflow_id)
Expand Down
6 changes: 6 additions & 0 deletions spec/services/batch/training/create_job_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@
allow(bajor_client_double).to receive(:create_training_job).and_raise(Bajor::Client::Error, error_message)
end

it 'does not notify Honeybadger about the failure' do
allow(Honeybadger).to receive(:notify)
training_create_job.run
expect(Honeybadger).not_to have_received(:notify)
end

it 'stores the error message on the training job resource' do
expect {
training_create_job.run
Expand Down
20 changes: 20 additions & 0 deletions spec/sidekiq/retrain_zoobot_job_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
allow(Export::TrainingData).to receive(:new).and_return(export_training_data_double)
allow(batch_training_create_job_double).to receive(:run).and_return(training_job)
allow(TrainingJob).to receive(:create!).and_call_original
allow(Honeybadger).to receive(:notify)
allow(Batch::Training::CreateJob).to receive(:new).with(instance_of(TrainingJob)).and_return(batch_training_create_job_double)
end

Expand Down Expand Up @@ -55,6 +56,25 @@
expect { job.perform(context.id) }.to raise_error(RetrainZoobotJob::Failure)
expect(TrainingJobMonitorJob).not_to have_received(:perform_in)
end

it 'notifies Honeybadger about the failed training job' do
expect { job.perform(context.id) }.to raise_error(RetrainZoobotJob::Failure)
expect(Honeybadger).to have_received(:notify).with(instance_of(RetrainZoobotJob::Failure))
end
end

context 'when training data export fails' do
let(:export_error) { Format::TrainingDataCsv::MissingLocationData.new('For subject id: 1') }

before do
allow(export_training_data_double).to receive(:run).and_raise(export_error)
end

it 'notifies Honeybadger before reraising' do
expect { job.perform(context.id) }.to raise_error(Format::TrainingDataCsv::MissingLocationData)

expect(Honeybadger).to have_received(:notify).with(export_error)
end
end

context 'with existing training data exports' do
Expand Down
Loading