merge retry mechanism change from gorealis v1 to gorealis v2 (#21)

This commit is contained in:
lawwong1 2023-01-26 13:36:40 -08:00 committed by GitHub
parent 8db625730f
commit d2fd7b9ba9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 449 additions and 206 deletions

236
realis.go
View file

@ -315,11 +315,13 @@ func (c *Client) GetInstanceIds(key aurora.JobKey, states []aurora.ScheduleStatu
Statuses: states,
}
c.logger.DebugPrintf("GetTasksWithoutConfigs Thrift Payload: %+v\n", taskQ)
c.logger.DebugPrintf("GetInstanceIds Thrift Payload: %+v\n", taskQ)
resp, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.GetTasksWithoutConfigs(context.TODO(), taskQ)
})
},
nil,
)
// If we encountered an error we couldn't recover from by retrying, return an error to the user
if retryErr != nil {
@ -341,8 +343,13 @@ func (c *Client) GetJobUpdateSummaries(jobUpdateQuery *aurora.JobUpdateQuery) (*
resp, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.readonlyClient.GetJobUpdateSummaries(context.TODO(), jobUpdateQuery)
})
},
nil,
)
if resp == nil || resp.GetResult_() == nil || resp.GetResult_().GetGetJobUpdateSummariesResult_() == nil {
return nil, errors.New("unexpected response from scheduler")
}
if retryErr != nil {
return nil, errors.Wrap(retryErr, "error getting job update summaries from Aurora Scheduler")
}
@ -354,8 +361,12 @@ func (c *Client) GetJobSummary(role string) (*aurora.JobSummaryResult_, error) {
resp, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.readonlyClient.GetJobSummary(context.TODO(), role)
})
},
nil,
)
if resp == nil || resp.GetResult_() == nil || resp.GetResult_().GetJobSummaryResult_() == nil {
return nil, errors.New("unexpected response from scheduler")
}
if retryErr != nil {
return nil, errors.Wrap(retryErr, "error getting job summaries from Aurora Scheduler")
}
@ -365,21 +376,20 @@ func (c *Client) GetJobSummary(role string) (*aurora.JobSummaryResult_, error) {
func (c *Client) GetJobs(role string) (*aurora.GetJobsResult_, error) {
var result *aurora.GetJobsResult_
resp, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.readonlyClient.GetJobs(context.TODO(), role)
})
},
nil,
)
if retryErr != nil {
return result, errors.Wrap(retryErr, "error getting Jobs from Aurora Scheduler")
return nil, errors.Wrap(retryErr, "error getting Jobs from Aurora Scheduler")
}
if resp == nil || resp.GetResult_() == nil {
return nil, errors.New("unexpected response from scheduler")
}
if resp.GetResult_() != nil {
result = resp.GetResult_().GetJobsResult_
}
return result, nil
return resp.GetResult_().GetJobsResult_, nil
}
// Kill specific instances of a job. Returns true, nil if a task was actually killed as a result of this API call.
@ -389,19 +399,19 @@ func (c *Client) KillInstances(key aurora.JobKey, instances ...int32) (bool, err
resp, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.KillTasks(context.TODO(), &key, instances, "")
})
},
nil,
)
if retryErr != nil {
return false, errors.Wrap(retryErr, "error sending Kill command to Aurora Scheduler")
}
if len(resp.GetDetails()) > 0 {
if resp == nil || len(resp.GetDetails()) > 0 {
c.logger.Println("KillTasks was called but no tasks killed as a result.")
return false, nil
} else {
return true, nil
}
return true, nil
}
func (c *Client) RealisConfig() *clientConfig {
@ -416,7 +426,9 @@ func (c *Client) KillJob(key aurora.JobKey) error {
_, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
// Giving the KillTasks thrift call an empty set tells the Aurora scheduler to kill all active shards
return c.client.KillTasks(context.TODO(), &key, nil, "")
})
},
nil,
)
if retryErr != nil {
return errors.Wrap(retryErr, "error sending Kill command to Aurora Scheduler")
@ -438,9 +450,27 @@ func (c *Client) CreateJob(auroraJob *AuroraJob) error {
return errors.Wrap(err, "unable to create Thermos payload")
}
_, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.CreateJob(context.TODO(), auroraJob.JobConfig())
})
// Response is checked by the thrift retry code
_, retryErr := c.thriftCallWithRetries(
false,
func() (*aurora.Response, error) {
return c.client.CreateJob(context.TODO(), auroraJob.JobConfig())
},
// On a client timeout, attempt to verify that payload made to the Scheduler by
// trying to get the config summary for the job key
func() (*aurora.Response, bool) {
exists, err := c.JobExists(auroraJob.JobKey())
if err != nil {
c.logger.Print("verification failed ", err)
}
if exists {
return &aurora.Response{ResponseCode: aurora.ResponseCode_OK}, true
}
return nil, false
},
)
if retryErr != nil {
return errors.Wrap(retryErr, "error sending Create command to Aurora Scheduler")
@ -471,7 +501,9 @@ func (c *Client) ScheduleCronJob(auroraJob *AuroraJob) error {
_, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.ScheduleCronJob(context.TODO(), auroraJob.JobConfig())
})
},
nil,
)
if retryErr != nil {
return errors.Wrap(retryErr, "error sending Cron AuroraJob Schedule message to Aurora Scheduler")
@ -485,7 +517,9 @@ func (c *Client) DescheduleCronJob(key aurora.JobKey) error {
_, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.DescheduleCronJob(context.TODO(), &key)
})
},
nil,
)
if retryErr != nil {
return errors.Wrap(retryErr, "error sending Cron AuroraJob De-schedule message to Aurora Scheduler")
@ -501,7 +535,9 @@ func (c *Client) StartCronJob(key aurora.JobKey) error {
_, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.StartCronJob(context.TODO(), &key)
})
},
nil,
)
if retryErr != nil {
return errors.Wrap(retryErr, "error sending Start Cron AuroraJob message to Aurora Scheduler")
@ -516,7 +552,9 @@ func (c *Client) RestartInstances(key aurora.JobKey, instances ...int32) error {
_, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.RestartShards(context.TODO(), &key, instances)
})
},
nil,
)
if retryErr != nil {
return errors.Wrap(retryErr, "error sending Restart command to Aurora Scheduler")
@ -537,16 +575,17 @@ func (c *Client) RestartJob(key aurora.JobKey) error {
if len(instanceIds) > 0 {
_, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.RestartShards(context.TODO(), &key, instanceIds)
})
},
nil,
)
if retryErr != nil {
return errors.Wrap(retryErr, "error sending Restart command to Aurora Scheduler")
}
return nil
} else {
return errors.New("no tasks in the Active state")
}
return errors.New("no tasks in the Active state")
}
// Update all tasks under a job configuration. Currently gorealis doesn't support for canary deployments.
@ -558,34 +597,80 @@ func (c *Client) StartJobUpdate(updateJob *JobUpdate, message string) (*aurora.S
c.logger.DebugPrintf("StartJobUpdate Thrift Payload: %+v %v\n", updateJob, message)
resp, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.StartJobUpdate(nil, updateJob.request, message)
})
resp, retryErr := c.thriftCallWithRetries(false,
func() (*aurora.Response, error) {
return c.client.StartJobUpdate(context.TODO(), updateJob.request, message)
},
func() (*aurora.Response, bool) {
key := updateJob.JobKey()
summariesResp, err := c.readonlyClient.GetJobUpdateSummaries(
context.TODO(),
&aurora.JobUpdateQuery{
JobKey: &key,
UpdateStatuses: aurora.ACTIVE_JOB_UPDATE_STATES,
Limit: 1,
})
if err != nil {
c.logger.Print("verification failed ", err)
return nil, false
}
summaries := response.JobUpdateSummaries(summariesResp)
if len(summaries) == 0 {
return nil, false
}
return &aurora.Response{
ResponseCode: aurora.ResponseCode_OK,
Result_: &aurora.Result_{
StartJobUpdateResult_: &aurora.StartJobUpdateResult_{
UpdateSummary: summaries[0],
Key: summaries[0].Key,
},
},
}, true
},
)
if retryErr != nil {
// A timeout took place when attempting this call, attempt to recover
if IsTimeout(retryErr) {
return nil, retryErr
}
return nil, errors.Wrap(retryErr, "error sending StartJobUpdate command to Aurora Scheduler")
}
if resp.GetResult_() != nil && resp.GetResult_().GetStartJobUpdateResult_() != nil {
return resp.GetResult_().GetStartJobUpdateResult_(), nil
if resp == nil || resp.GetResult_() == nil || resp.GetResult_().GetStartJobUpdateResult_() == nil {
return nil, errors.New("unexpected response from scheduler")
}
return nil, errors.New("thrift error: Field in response is nil unexpectedly.")
return resp.GetResult_().GetStartJobUpdateResult_(), nil
}
// Abort AuroraJob Update on Aurora. Requires the updateId which can be obtained on the Aurora web UI.
// AbortJobUpdate terminates a job update in the scheduler.
// It requires the updateId which can be obtained on the Aurora web UI.
// This API is meant to be synchronous. It will attempt to wait until the update transitions to the aborted state.
// However, if the job update does not transition to the ABORT state an error will be returned.
func (c *Client) AbortJobUpdate(updateKey aurora.JobUpdateKey, message string) error {
c.logger.DebugPrintf("AbortJobUpdate Thrift Payload: %+v %v\n", updateKey, message)
_, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.AbortJobUpdate(context.TODO(), &updateKey, message)
})
},
nil,
)
if retryErr != nil {
return errors.Wrap(retryErr, "error sending AbortJobUpdate command to Aurora Scheduler")
}
return nil
// Make this call synchronous by blocking until it job has successfully transitioned to aborted
_, err := c.MonitorJobUpdateStatus(
updateKey,
[]aurora.JobUpdateStatus{aurora.JobUpdateStatus_ABORTED},
time.Second*5,
time.Minute)
return err
}
// Pause AuroraJob Update. UpdateID is returned from StartJobUpdate or the Aurora web UI.
@ -605,7 +690,9 @@ func (c *Client) PauseJobUpdate(updateKey *aurora.JobUpdateKey, message string)
_, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.PauseJobUpdate(nil, updateKeyLocal, message)
})
},
nil,
)
if retryErr != nil {
return errors.Wrap(retryErr, "error sending PauseJobUpdate command to Aurora Scheduler")
@ -632,7 +719,9 @@ func (c *Client) ResumeJobUpdate(updateKey aurora.JobUpdateKey, message string)
_, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.ResumeJobUpdate(context.TODO(), &updateKey, message)
})
},
nil,
)
if retryErr != nil {
return errors.Wrap(retryErr, "error sending ResumeJobUpdate command to Aurora Scheduler")
@ -653,18 +742,19 @@ func (c *Client) PulseJobUpdate(updateKey aurora.JobUpdateKey) (aurora.JobUpdate
resp, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.PulseJobUpdate(context.TODO(), &updateKey)
})
},
nil,
)
if retryErr != nil {
return aurora.JobUpdatePulseStatus(0), errors.Wrap(retryErr, "error sending PulseJobUpdate command to Aurora Scheduler")
}
if resp.GetResult_() != nil && resp.GetResult_().GetPulseJobUpdateResult_() != nil {
return resp.GetResult_().GetPulseJobUpdateResult_().GetStatus(), nil
} else {
return aurora.JobUpdatePulseStatus(0), errors.New("thrift error, field was nil unexpectedly")
if resp == nil || resp.GetResult_() == nil || resp.GetResult_().GetPulseJobUpdateResult_() == nil {
return aurora.JobUpdatePulseStatus(0), errors.New("unexpected response from scheduler")
}
return resp.GetResult_().GetPulseJobUpdateResult_().GetStatus(), nil
}
// Scale up the number of instances under a job configuration using the configuration for specific
@ -681,7 +771,9 @@ func (c *Client) AddInstances(instKey aurora.InstanceKey, count int32) error {
_, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.AddInstances(context.TODO(), &instKey, count)
})
},
nil,
)
if retryErr != nil {
return errors.Wrap(retryErr, "error sending AddInstances command to Aurora Scheduler")
@ -726,11 +818,16 @@ func (c *Client) GetTaskStatus(query *aurora.TaskQuery) ([]*aurora.ScheduledTask
resp, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.GetTasksStatus(context.TODO(), query)
})
},
nil,
)
if retryErr != nil {
return nil, errors.Wrap(retryErr, "error querying Aurora Scheduler for task status")
}
if resp == nil {
return nil, errors.New("unexpected response from scheduler")
}
return response.ScheduleStatusResult(resp).GetTasks(), nil
}
@ -742,29 +839,32 @@ func (c *Client) GetPendingReason(query *aurora.TaskQuery) ([]*aurora.PendingRea
resp, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.GetPendingReason(context.TODO(), query)
})
},
nil,
)
if retryErr != nil {
return nil, errors.Wrap(retryErr, "error querying Aurora Scheduler for pending Reasons")
}
var result []*aurora.PendingReason
if resp.GetResult_() != nil {
result = resp.GetResult_().GetGetPendingReasonResult_().GetReasons()
if resp == nil || resp.GetResult_() == nil || resp.GetResult_().GetGetPendingReasonResult_() == nil {
return nil, errors.New("unexpected response from scheduler")
}
return result, nil
return resp.GetResult_().GetGetPendingReasonResult_().GetReasons(), nil
}
// Get information about task including without a task configuration object
// GetTasksWithoutConfigs gets information about task including without a task configuration object.
// This is a more lightweight version of GetTaskStatus but contains less information as a result.
func (c *Client) GetTasksWithoutConfigs(query *aurora.TaskQuery) ([]*aurora.ScheduledTask, error) {
c.logger.DebugPrintf("GetTasksWithoutConfigs Thrift Payload: %+v\n", query)
resp, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.GetTasksWithoutConfigs(context.TODO(), query)
})
},
nil,
)
if retryErr != nil {
return nil, errors.Wrap(retryErr, "error querying Aurora Scheduler for task status without configs")
@ -791,7 +891,9 @@ func (c *Client) FetchTaskConfig(instKey aurora.InstanceKey) (*aurora.TaskConfig
resp, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.GetTasksStatus(context.TODO(), taskQ)
})
},
nil,
)
if retryErr != nil {
return nil, errors.Wrap(retryErr, "error querying Aurora Scheduler for task configuration")
@ -817,17 +919,19 @@ func (c *Client) JobUpdateDetails(updateQuery aurora.JobUpdateQuery) ([]*aurora.
resp, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.GetJobUpdateDetails(context.TODO(), &updateQuery)
})
},
nil,
)
if retryErr != nil {
return nil, errors.Wrap(retryErr, "unable to get job update details")
}
if resp.GetResult_() != nil && resp.GetResult_().GetGetJobUpdateDetailsResult_() != nil {
return resp.GetResult_().GetGetJobUpdateDetailsResult_().GetDetailsList(), nil
} else {
return nil, errors.New("unknown Thrift error, field is nil.")
if resp == nil || resp.GetResult_() == nil || resp.GetResult_().GetGetJobUpdateDetailsResult_() == nil {
return nil, errors.New("unexpected response from scheduler")
}
return resp.GetResult_().GetGetJobUpdateDetailsResult_().GetDetailsList(), nil
}
func (c *Client) RollbackJobUpdate(key aurora.JobUpdateKey, message string) error {
@ -836,7 +940,9 @@ func (c *Client) RollbackJobUpdate(key aurora.JobUpdateKey, message string) erro
_, retryErr := c.thriftCallWithRetries(false, func() (*aurora.Response, error) {
return c.client.RollbackJobUpdate(context.TODO(), &key, message)
})
},
nil,
)
if retryErr != nil {
return errors.Wrap(retryErr, "unable to roll back job update")