From 6183c5a37535a05c6c310b42f076df60224e1ed5 Mon Sep 17 00:00:00 2001 From: Renan DelValle Date: Mon, 14 Jan 2019 15:32:47 -0800 Subject: [PATCH] Addressing feedback. Monitors now return errors which provide context through behavior. Adding notes to the doc explaining what happens when AbortJob times out. --- errors.go | 13 +++++++++++++ monitors.go | 39 +++++++++++++-------------------------- realis.go | 8 +++----- 3 files changed, 29 insertions(+), 31 deletions(-) diff --git a/errors.go b/errors.go index 7007f83..7411a5e 100644 --- a/errors.go +++ b/errors.go @@ -28,6 +28,19 @@ func IsTimeout(err error) bool { return ok && temp.Timedout() } +type timeoutErr struct { + error + timedout bool +} + +func (r *timeoutErr) Timedout() bool { + return r.timedout +} + +func newTimedoutError(err error) *timeoutErr { + return &timeoutErr{error: err, timedout: true} +} + // retryErr is a superset of timeout which includes extra context // with regards to our retry mechanism. This is done in order to make sure // that our retry mechanism works as expected through our tests and should diff --git a/monitors.go b/monitors.go index dea9cde..3106cc6 100644 --- a/monitors.go +++ b/monitors.go @@ -23,20 +23,13 @@ import ( "github.com/pkg/errors" ) -const ( - UpdateFailed = "update failed" - RolledBack = "update rolled back" - UpdateAborted = "update aborted" - Timeout = "timeout" - UpdateError = "update encountered an error" -) - type Monitor struct { Client Realis } // Polls the scheduler every certain amount of time to see if the update has succeeded func (m *Monitor) JobUpdate(updateKey aurora.JobUpdateKey, interval int, timeout int) (bool, error) { + status, err := m.JobUpdateStatus(updateKey, map[aurora.JobUpdateStatus]bool{ aurora.JobUpdateStatus_ROLLED_FORWARD: true, @@ -52,30 +45,24 @@ func (m *Monitor) JobUpdate(updateKey aurora.JobUpdateKey, interval int, timeout return false, err } + m.Client.RealisConfig().logger.Printf("job update status: %v\n", status) + // Rolled 
forward is the only state in which an update has been successfully updated // if we encounter an inactive state and it is not at rolled forward, update failed switch status { case aurora.JobUpdateStatus_ROLLED_FORWARD: - m.Client.RealisConfig().logger.Println("update succeeded") return true, nil - case aurora.JobUpdateStatus_ROLLED_BACK: - m.Client.RealisConfig().logger.Println(RolledBack) - return false, errors.New(RolledBack) - case aurora.JobUpdateStatus_ABORTED: - m.Client.RealisConfig().logger.Println(UpdateAborted) - return false, errors.New(UpdateAborted) - case aurora.JobUpdateStatus_ERROR: - m.Client.RealisConfig().logger.Println(UpdateError) - return false, errors.New(UpdateError) - case aurora.JobUpdateStatus_FAILED: - m.Client.RealisConfig().logger.Println(UpdateFailed) - return false, errors.New(UpdateFailed) + case aurora.JobUpdateStatus_ROLLED_BACK, aurora.JobUpdateStatus_ABORTED, aurora.JobUpdateStatus_ERROR, aurora.JobUpdateStatus_FAILED: + return false, errors.Errorf("bad terminal state for update: %v", status) default: - return false, nil + return false, errors.Errorf("unexpected update state: %v", status) } } -func (m *Monitor) JobUpdateStatus(updateKey aurora.JobUpdateKey, desiredStatuses map[aurora.JobUpdateStatus]bool, interval, timeout time.Duration) (aurora.JobUpdateStatus, error) { +func (m *Monitor) JobUpdateStatus(updateKey aurora.JobUpdateKey, + desiredStatuses map[aurora.JobUpdateStatus]bool, + interval time.Duration, + timeout time.Duration) (aurora.JobUpdateStatus, error) { updateQ := aurora.JobUpdateQuery{ Key: &updateKey, @@ -109,7 +96,7 @@ func (m *Monitor) JobUpdateStatus(updateKey aurora.JobUpdateKey, desiredStatuses } case <-timer.C: - return aurora.JobUpdateStatus(-1), errors.New(Timeout) + return aurora.JobUpdateStatus(-1), newTimedoutError(errors.New("job update monitor timed out")) } } } @@ -144,7 +131,7 @@ func (m *Monitor) ScheduleStatus(key *aurora.JobKey, instanceCount int32, desire case <-timer.C: // If the timer runs 
out, return a timeout error to user - return false, errors.New(Timeout) + return false, newTimedoutError(errors.New("schedule status monitor timed out")) } } } @@ -204,7 +191,7 @@ func (m *Monitor) HostMaintenance(hosts []string, modes []aurora.MaintenanceMode hostResult[host] = false } - return hostResult, errors.New(Timeout) + return hostResult, newTimedoutError(errors.New("host maintenance monitor timed out")) } } } diff --git a/realis.go b/realis.go index 9427ecf..9acf364 100644 --- a/realis.go +++ b/realis.go @@ -775,6 +775,8 @@ func (r *realisClient) StartJobUpdate(updateJob *UpdateJob, message string) (*au } // Abort Job Update on Aurora. Requires the updateId which can be obtained on the Aurora web UI. +// This API is meant to be synchronous. It will attempt to wait until the update transitions to the aborted state. +// However, if the job update does not transition to the ABORTED state, an error will be returned. func (r *realisClient) AbortJobUpdate(updateKey aurora.JobUpdateKey, message string) (*aurora.Response, error) { r.logger.DebugPrintf("AbortJobUpdate Thrift Payload: %+v %v\n", updateKey, message) @@ -791,11 +793,7 @@ func (r *realisClient) AbortJobUpdate(updateKey aurora.JobUpdateKey, message str m := Monitor{Client: r} _, err := m.JobUpdateStatus(updateKey, map[aurora.JobUpdateStatus]bool{aurora.JobUpdateStatus_ABORTED: true}, time.Second*5, time.Minute) - if err != nil { - return resp, err - } else { - return resp, nil - } + return resp, err } //Pause Job Update. UpdateID is returned from StartJobUpdate or the Aurora web UI.