Thread safety, misc fixes, and refactoring (#51)

* Changed the incorrect license in some source files.

* Changed CreateService to mimic CreateJob by setting the batch size to the instance count.

* Renamed Getcerts to GetCerts to match the naming style of the rest of the codebase.

* Overhauled error handling. Backoff now recognizes temporary errors and keeps retrying when it encounters one (a sketch of this retry loop follows this list).

* Changed the Thrift function call wrapper to have a more explicit name and to perform more safety checks.

* Moved Jitter function from realis to retry.

* API code is now more uniform and follows a common template.

* A lock is now acquired whenever a Thrift call is made or the connection is modified. Note that calling ReestablishConn externally may still result in race conditions; this function will be made private in the near future.

* Added a test for Realis session thread safety, plus tests for the ScheduleStatus monitor and for the monitor timing out.

* Returning a nil response whenever an error is returned so that there is no ambiguity.

* Using defer to unlock so that the lock is still released if a panic occurs.
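
The retry changes described above live in retry.go and are not part of the realis.go hunks shown below. As a rough, self-contained sketch of the idea (the `Backoff`, `ConditionFunc`, and `temporaryErr` names here are illustrative assumptions, not necessarily the exact gorealis API), a backoff loop that keeps retrying on temporary errors while aborting on anything else could look like this; `Jitter` mirrors the helper that this change moves from realis to retry:

```go
// Sketch only: approximates how the reworked backoff loop may treat
// temporary errors. Backoff, ConditionFunc, and temporaryErr are
// illustrative names, not necessarily the exact gorealis types.
package main

import (
	"errors"
	"fmt"
	"math/rand"
	"time"
)

// Backoff mirrors the kind of retry policy the client is configured with.
type Backoff struct {
	Duration time.Duration // initial wait between attempts
	Factor   float64       // multiplier applied between attempts
	Steps    int           // maximum number of attempts
	Jitter   float64       // jitter factor passed to Jitter below
}

// ConditionFunc returns (done, err); done == true stops retrying.
type ConditionFunc func() (bool, error)

// temporaryErr marks errors that are safe to retry, e.g. connection
// hiccups or ERROR_TRANSIENT responses from the scheduler.
type temporaryErr struct{ error }

func (t temporaryErr) Temporary() bool { return true }

// Jitter returns a duration in [duration, duration+maxFactor*duration),
// matching the helper moved from realis to retry in this change.
func Jitter(duration time.Duration, maxFactor float64) time.Duration {
	if maxFactor <= 0.0 {
		maxFactor = 1.0
	}
	return duration + time.Duration(rand.Float64()*maxFactor*float64(duration))
}

// ExponentialBackoff keeps retrying while the condition reports a
// temporary error; any other error aborts the loop immediately.
func ExponentialBackoff(backoff Backoff, condition ConditionFunc) error {
	duration := backoff.Duration
	for i := 0; i < backoff.Steps; i++ {
		if i != 0 {
			time.Sleep(Jitter(duration, backoff.Jitter))
			duration = time.Duration(float64(duration) * backoff.Factor)
		}

		done, err := condition()
		if done {
			return nil
		}
		if err != nil {
			// Temporary errors do not abort the loop; everything else
			// is surfaced to the caller right away.
			var tmp interface{ Temporary() bool }
			if errors.As(err, &tmp) && tmp.Temporary() {
				continue
			}
			return err
		}
	}
	return errors.New("ran out of retries")
}

func main() {
	attempts := 0
	err := ExponentialBackoff(
		Backoff{Duration: 100 * time.Millisecond, Factor: 2.0, Steps: 5, Jitter: 0.1},
		func() (bool, error) {
			attempts++
			if attempts < 3 {
				return false, temporaryErr{errors.New("scheduler temporarily unavailable")}
			}
			return true, nil
		})
	fmt.Println(attempts, err)
}
```

In the diff below, thriftCallHelper wraps connection problems and ERROR_TRANSIENT responses with NewTemporaryError, so a loop of this shape keeps retrying those cases and surfaces every other error immediately.
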
Renan DelValle 2018-01-21 19:30:01 -08:00 committed by GitHub
parent b2ffb73183
commit a941bcb679
6 changed files with 256 additions and 245 deletions

realis.go

@@ -21,12 +21,13 @@ import (
"encoding/base64"
"fmt"
"io/ioutil"
"math/rand"
"net/http"
"net/http/cookiejar"
"path/filepath"
"time"
"sync"
"git.apache.org/thrift.git/lib/go/thrift"
"github.com/paypal/gorealis/gen-go/apache/aurora"
"github.com/paypal/gorealis/response"
@@ -56,6 +57,7 @@ type Realis interface {
ScheduleCronJob(auroraJob Job) (*aurora.Response, error)
StartJobUpdate(updateJob *UpdateJob, message string) (*aurora.Response, error)
StartCronJob(key *aurora.JobKey) (*aurora.Response, error)
// TODO: Remove this method and make it private to avoid race conditions
ReestablishConn() error
RealisConfig() *RealisConfig
Close()
@@ -72,6 +74,7 @@ type realisClient struct {
readonlyClient *aurora.ReadOnlySchedulerClient
adminClient *aurora.AuroraAdminClient
logger Logger
lock sync.Mutex
}
type RealisConfig struct {
@@ -285,19 +288,6 @@ func NewRealisClient(options ...ClientOption) (Realis, error) {
}
// Jitter returns a time.Duration between duration and duration + maxFactor *
// duration.
//
// This allows clients to avoid converging on periodic behavior. If maxFactor
// is 0.0, a suggested default value will be chosen.
func Jitter(duration time.Duration, maxFactor float64) time.Duration {
if maxFactor <= 0.0 {
maxFactor = 1.0
}
wait := duration + time.Duration(rand.Float64()*maxFactor*float64(duration))
return wait
}
func GetDefaultClusterFromZKUrl(zkurl string) *Cluster {
return &Cluster{
Name: "defaultCluster",
@@ -309,7 +299,7 @@ func GetDefaultClusterFromZKUrl(zkurl string) *Cluster {
}
}
func Getcerts(certpath string) (*x509.CertPool, error) {
func GetCerts(certpath string) (*x509.CertPool, error) {
globalRootCAs := x509.NewCertPool()
caFiles, err := ioutil.ReadDir(certpath)
if err != nil {
@@ -339,7 +329,7 @@ func defaultTTransport(urlstr string, timeoutms int, config *RealisConfig) (thri
tlsConfig.InsecureSkipVerify = true
}
if config.certspath != "" {
rootCAs, err := Getcerts("examples/certs")
rootCAs, err := GetCerts("examples/certs")
if err != nil {
config.logger.Println("error occured couldn't fetch certs")
return nil, err
@@ -428,15 +418,51 @@ func basicAuth(username, password string) string {
return base64.StdEncoding.EncodeToString([]byte(auth))
}
type auroraThriftCall func() (resp *aurora.Response, err error)
// Takes a Thrift API function call and returns the response and an error.
// If the error from the API call is retryable, the function re-establishes the connection with Aurora
// using the same configuration as the original client. Locks usage of, and changes to, the client
// connection in order to make realis sessions thread safe.
func (r *realisClient) thriftCallHelper(auroraCall auroraThriftCall) (*aurora.Response, error) {
// Only allow one goroutine to use or modify the Thrift client connection at a time
r.lock.Lock()
defer r.lock.Unlock()
resp, cliErr := auroraCall()
if cliErr != nil {
// ReestablishConn returns a temporary error or nil,
// since we can always retry connecting to the scheduler.
retryConnErr := r.ReestablishConn()
return resp, retryConnErr
}
if resp == nil {
return nil, errors.New("Response is nil")
}
if resp.GetResponseCode() == aurora.ResponseCode_ERROR_TRANSIENT {
return resp, NewTemporaryError(errors.New("Aurora scheduler temporarily unavailable"))
}
if resp.GetResponseCode() != aurora.ResponseCode_OK {
return nil, errors.New(response.CombineMessage(resp))
}
return resp, nil
}
func (r *realisClient) ReestablishConn() error {
// Close existing connection
r.logger.Println("ReestablishConn begin ....")
r.logger.Println("Re-establishing Connection to Aurora")
r.Close()
// Recreate connection from scratch using original options
newRealis, err := NewRealisClient(r.config.options...)
if err != nil {
return err
// This could be a temporary network hiccup
return NewTemporaryError(err)
}
// If we are able to successfully re-connect, make receiver
@@ -471,25 +497,25 @@ func (r *realisClient) GetInstanceIds(key *aurora.JobKey, states map[aurora.Sche
var resp *aurora.Response
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.GetTasksWithoutConfigs(taskQ)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
// Pass error directly to backoff which makes exceptions for temporary errors
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
// If we encountered an error we couldn't recover from by retrying, return an error to the user
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error querying Aurora Scheduler for active IDs")
}
resp, clientErr = response.ResponseCodeCheck(resp)
if clientErr != nil {
return nil, clientErr
}
// Construct instance id map to stay in line with thrift's representation of sets
tasks := response.ScheduleStatusResult(resp).GetTasks()
jobInstanceIds := make(map[int32]bool)
for _, task := range tasks {
@@ -504,23 +530,22 @@ func (r *realisClient) GetJobUpdateSummaries(jobUpdateQuery *aurora.JobUpdateQue
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.readonlyClient.GetJobUpdateSummaries(jobUpdateQuery)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error getting job update summaries from Aurora Scheduler")
}
return response.ResponseCodeCheck(resp)
return resp, nil
}
// Kill specific instances of a job.
@@ -535,22 +560,21 @@ func (r *realisClient) KillInstances(key *aurora.JobKey, instances ...int32) (*a
}
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.KillTasks(key, instanceIds, "")
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error sending Kill command to Aurora Scheduler")
}
return response.ResponseCodeCheck(resp)
return resp, nil
}
func (r *realisClient) RealisConfig() *RealisConfig {
@@ -570,21 +594,21 @@ func (r *realisClient) KillJob(key *aurora.JobKey) (*aurora.Response, error) {
if len(instanceIds) > 0 {
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.KillTasks(key, instanceIds, "")
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(err, retryErr.Error()+"Error sending Kill command to Aurora Scheduler")
if retryErr != nil {
return nil, errors.Wrap(err, retryErr.Error()+": Error sending Kill command to Aurora Scheduler")
}
return response.ResponseCodeCheck(resp)
return resp, nil
}
return nil, errors.New("No tasks in the Active state")
}
@@ -598,23 +622,21 @@ func (r *realisClient) CreateJob(auroraJob Job) (*aurora.Response, error) {
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.CreateJob(auroraJob.JobConfig())
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+"Error sending Create command to Aurora Scheduler")
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error sending Create command to Aurora Scheduler")
}
return response.ResponseCodeCheck(resp)
return resp, nil
}
// This API uses an update thrift call to create the services giving a few more robust features.
@@ -622,6 +644,7 @@ func (r *realisClient) CreateService(auroraJob Job, settings UpdateSettings) (*a
// Create a new job update object and ship it to the StartJobUpdate api
update := NewUpdateJob(auroraJob.TaskConfig(), &settings.settings)
update.InstanceCount(auroraJob.GetInstanceCount())
update.BatchSize(auroraJob.GetInstanceCount())
resp, err := r.StartJobUpdate(update, "")
if err != nil {
@@ -640,22 +663,21 @@ func (r *realisClient) ScheduleCronJob(auroraJob Job) (*aurora.Response, error)
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.ScheduleCronJob(auroraJob.JobConfig())
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+"Error sending Cron Job Schedule message to Aurora Scheduler")
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error sending Cron Job Schedule message to Aurora Scheduler")
}
return response.ResponseCodeCheck(resp)
return resp, nil
}
func (r *realisClient) DescheduleCronJob(key *aurora.JobKey) (*aurora.Response, error) {
@@ -664,22 +686,21 @@ func (r *realisClient) DescheduleCronJob(key *aurora.JobKey) (*aurora.Response,
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.DescheduleCronJob(key)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+"Error sending Cron Job De-schedule message to Aurora Scheduler")
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error sending Cron Job De-schedule message to Aurora Scheduler")
}
return response.ResponseCodeCheck(resp)
return resp, nil
}
@@ -688,22 +709,21 @@ func (r *realisClient) StartCronJob(key *aurora.JobKey) (*aurora.Response, error
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.StartCronJob(key)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+"Error sending Start Cron Job message to Aurora Scheduler")
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error sending Start Cron Job message to Aurora Scheduler")
}
return response.ResponseCodeCheck(resp)
return resp, nil
}
@@ -718,22 +738,21 @@ func (r *realisClient) RestartInstances(key *aurora.JobKey, instances ...int32)
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.RestartShards(key, instanceIds)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+"Error sending Restart command to Aurora Scheduler")
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error sending Restart command to Aurora Scheduler")
}
return response.ResponseCodeCheck(resp)
return resp, nil
}
// Restarts all active tasks under a job configuration.
@@ -747,22 +766,22 @@ func (r *realisClient) RestartJob(key *aurora.JobKey) (*aurora.Response, error)
var clientErr error
if len(instanceIds) > 0 {
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.RestartShards(key, instanceIds)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+"Error sending Restart command to Aurora Scheduler")
}
return response.ResponseCodeCheck(resp)
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error sending Restart command to Aurora Scheduler")
}
return resp, nil
} else {
return nil, errors.New("No tasks in the Active state")
}
@@ -775,22 +794,21 @@ func (r *realisClient) StartJobUpdate(updateJob *UpdateJob, message string) (*au
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.StartJobUpdate(updateJob.req, message)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+"Error sending StartJobUpdate command to Aurora Scheduler")
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error sending StartJobUpdate command to Aurora Scheduler")
}
return response.ResponseCodeCheck(resp)
return resp, nil
}
// Abort Job Update on Aurora. Requires the updateId which can be obtained on the Aurora web UI.
@@ -801,22 +819,21 @@ func (r *realisClient) AbortJobUpdate(
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.AbortJobUpdate(&updateKey, message)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+"Error sending AbortJobUpdate command to Aurora Scheduler")
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error sending AbortJobUpdate command to Aurora Scheduler")
}
return response.ResponseCodeCheck(resp)
return resp, nil
}
// Scale up the number of instances under a job configuration using the configuration for specific
@@ -827,22 +844,21 @@ func (r *realisClient) AddInstances(instKey aurora.InstanceKey, count int32) (*a
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.AddInstances(&instKey, count)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+"Error sending AddInstances command to Aurora Scheduler")
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error sending AddInstances command to Aurora Scheduler")
}
return response.ResponseCodeCheck(resp)
return resp, nil
}
@@ -875,24 +891,19 @@ func (r *realisClient) GetTaskStatus(query *aurora.TaskQuery) (tasks []*aurora.S
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.GetTasksStatus(query)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+"Error querying Aurora Scheduler for task status")
}
//Check for response code..
if resp.GetResponseCode() != aurora.ResponseCode_OK {
return nil, errors.New(resp.ResponseCode.String() + "--" + response.CombineMessage(resp))
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error querying Aurora Scheduler for task status")
}
return response.ScheduleStatusResult(resp).GetTasks(), nil
@@ -904,24 +915,19 @@ func (r *realisClient) GetTasksWithoutConfigs(query *aurora.TaskQuery) (tasks []
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.GetTasksWithoutConfigs(query)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+"Error querying Aurora Scheduler for task status without configs")
}
//Check for response code..
if resp.GetResponseCode() != aurora.ResponseCode_OK {
return nil, errors.New(resp.ResponseCode.String() + "--" + response.CombineMessage(resp))
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error querying Aurora Scheduler for task status without configs")
}
return response.ScheduleStatusResult(resp).GetTasks(), nil
@@ -946,24 +952,19 @@ func (r *realisClient) FetchTaskConfig(instKey aurora.InstanceKey) (*aurora.Task
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.GetTasksStatus(taskQ)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+"Error querying Aurora Scheduler for task configuration")
}
//Check for response code..
if resp.GetResponseCode() != aurora.ResponseCode_OK {
return nil, errors.New(resp.ResponseCode.String() + "--" + response.CombineMessage(resp))
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Error querying Aurora Scheduler for task configuration")
}
tasks := response.ScheduleStatusResult(resp).GetTasks()
@@ -986,22 +987,21 @@ func (r *realisClient) JobUpdateDetails(updateQuery aurora.JobUpdateQuery) (*aur
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.GetJobUpdateDetails(&updateQuery)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+"Unable to get job update details")
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Unable to get job update details")
}
return response.ResponseCodeCheck(resp)
return resp, nil
}
@@ -1010,22 +1010,21 @@ func (r *realisClient) RollbackJobUpdate(key aurora.JobUpdateKey, message string
var clientErr error
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.client.RollbackJobUpdate(&key, message)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+"Unable to roll back job update")
if retryErr != nil {
return nil, errors.Wrap(clientErr, retryErr.Error()+": Unable to roll back job update")
}
return response.ResponseCodeCheck(resp)
return resp, nil
}
// Set a list of nodes to DRAINING. This means nothing will be able to be scheduled on them and any existing
@@ -1048,26 +1047,25 @@ func (r *realisClient) DrainHosts(hosts ...string) (*aurora.Response, *aurora.Dr
}
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.adminClient.DrainHosts(drainList)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, nil, errors.Wrap(clientErr, retryErr.Error()+"Unable to recover connection")
}
if resp != nil && resp.GetResult_() != nil {
result = resp.GetResult_().GetDrainHostsResult_()
}
if retryErr != nil {
return resp, result, errors.Wrap(clientErr, retryErr.Error()+": Unable to recover connection")
}
return resp, result, nil
}
@@ -1088,26 +1086,25 @@ func (r *realisClient) EndMaintenance(hosts ...string) (*aurora.Response, *auror
}
retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.adminClient.EndMaintenance(hostList)
})
if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
return false, nil
}
if clientErr != nil {
return false, clientErr
}
return true, nil
})
if clientErr != nil {
return nil, nil, errors.Wrap(clientErr, retryErr.Error()+"Unable to recover connection")
}
if resp != nil && resp.GetResult_() != nil {
result = resp.GetResult_().GetEndMaintenanceResult_()
}
if retryErr != nil {
return resp, result, errors.Wrap(clientErr, retryErr.Error()+": Unable to recover connection")
}
return resp, result, nil
}
@@ -1115,7 +1112,7 @@ func (r *realisClient) MaintenanceStatus(hosts ...string) (*aurora.Response, *au
var resp *aurora.Response
var result *aurora.MaintenanceStatusResult_
var returnErr, clientErr, payloadErr error
var clientErr error
if len(hosts) == 0 {
return nil, nil, errors.New("no hosts provided to get maintenance status from")
@@ -1129,23 +1126,14 @@ func (r *realisClient) MaintenanceStatus(hosts ...string) (*aurora.Response, *au
retryErr := ExponentialBackoff(defaultBackoff, func() (bool, error) {
// Send thrift call, if we have a thrift send error, attempt to reconnect
// and continue trying to resend command
if resp, clientErr = r.adminClient.MaintenanceStatus(hostList); clientErr != nil {
// Experienced an connection error
err1 := r.ReestablishConn()
if err1 != nil {
r.logger.Println("error in re-establishing connection: ", err1)
}
return false, nil
}
// Make thrift call. If we encounter an error sending the call, attempt to reconnect
// and continue trying to resend command until we run out of retries.
resp, clientErr = r.thriftCallHelper(func() (*aurora.Response, error) {
return r.adminClient.MaintenanceStatus(hostList)
})
// If error is NOT due to connection
if _, payloadErr = response.ResponseCodeCheck(resp); payloadErr != nil {
// TODO(rdelvalle): an leader election may cause the response to have
// failed when it should have succeeded. Retry everything for now until
// we figure out a more concrete fix.
return false, nil
if clientErr != nil {
return false, clientErr
}
// Successful call
@@ -1157,20 +1145,9 @@ func (r *realisClient) MaintenanceStatus(hosts ...string) (*aurora.Response, *au
result = resp.GetResult_().GetMaintenanceStatusResult_()
}
// Prioritize returning a bad payload error over a client error as a bad payload error indicates
// a deeper issue
if payloadErr != nil {
returnErr = payloadErr
} else {
returnErr = clientErr
}
// Timed out on retries. *Note that when we fix the unexpected errors with a correct payload,
// this will can become either a timeout error or a payload error
if retryErr != nil {
return resp, result, errors.Wrap(returnErr, "Unable to recover connection")
return resp, result, errors.Wrap(clientErr, retryErr.Error()+": Unable to recover connection")
}
return resp, result, nil
}