From dabd7418a9723f816475224589cd51cc19965367 Mon Sep 17 00:00:00 2001 From: Renan DelValle Date: Mon, 29 Oct 2018 15:03:27 -0700 Subject: [PATCH] Detecting if the transport error was not temporary in which case we stop retrying. Changed bug where get results was being called before we checked for an error. --- realis.go | 44 ++++++++++++++++++++++---------------------- retry.go | 15 +++++++++++++++ 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/realis.go b/realis.go index 9972a2a..f934565 100644 --- a/realis.go +++ b/realis.go @@ -555,14 +555,14 @@ func (r *realisClient) GetJobs(role string) (*aurora.Response, *aurora.GetJobsRe return r.readonlyClient.GetJobs(role) }) - if resp.GetResult_() != nil { - result = resp.GetResult_().GetJobsResult_ - } - if retryErr != nil { return nil, result, errors.Wrap(retryErr, "Error getting Jobs from Aurora Scheduler") } + if resp.GetResult_() != nil { + result = resp.GetResult_().GetJobsResult_ + } + return resp, result, nil } @@ -635,7 +635,7 @@ func (r *realisClient) CreateService(auroraJob Job, settings *aurora.JobUpdateSe return resp, nil, errors.Wrap(err, "unable to create service") } - if resp != nil && resp.GetResult_() != nil { + if resp.GetResult_() != nil { return resp, resp.GetResult_().GetStartJobUpdateResult_(), nil } @@ -879,7 +879,7 @@ func (r *realisClient) GetPendingReason(query *aurora.TaskQuery) (pendingReasons var result map[*aurora.PendingReason]bool - if resp != nil && resp.GetResult_() != nil { + if resp.GetResult_() != nil { result = resp.GetResult_().GetGetPendingReasonResult_().GetReasons() } for reason := range result { @@ -999,14 +999,14 @@ func (r *realisClient) DrainHosts(hosts ...string) (*aurora.Response, *aurora.Dr return r.adminClient.DrainHosts(drainList) }) - if resp != nil && resp.GetResult_() != nil { - result = resp.GetResult_().GetDrainHostsResult_() - } - if retryErr != nil { return resp, result, errors.Wrap(retryErr, "Unable to recover connection") } + if resp.GetResult_() != nil { + result = resp.GetResult_().GetDrainHostsResult_() + } + return resp, result, nil } @@ -1030,14 +1030,14 @@ func (r *realisClient) StartMaintenance(hosts ...string) (*aurora.Response, *aur return r.adminClient.StartMaintenance(hostList) }) - if resp.GetResult_() != nil { - result = resp.GetResult_().GetStartMaintenanceResult_() - } - if retryErr != nil { return resp, result, errors.Wrap(retryErr, "Unable to recover connection") } + if resp.GetResult_() != nil { + result = resp.GetResult_().GetStartMaintenanceResult_() + } + return resp, result, nil } @@ -1061,14 +1061,14 @@ func (r *realisClient) EndMaintenance(hosts ...string) (*aurora.Response, *auror return r.adminClient.EndMaintenance(hostList) }) - if resp.GetResult_() != nil { - result = resp.GetResult_().GetEndMaintenanceResult_() - } - if retryErr != nil { return resp, result, errors.Wrap(retryErr, "Unable to recover connection") } + if resp.GetResult_() != nil { + result = resp.GetResult_().GetEndMaintenanceResult_() + } + return resp, result, nil } @@ -1094,14 +1094,14 @@ func (r *realisClient) MaintenanceStatus(hosts ...string) (*aurora.Response, *au return r.adminClient.MaintenanceStatus(hostList) }) - if resp.GetResult_() != nil { - result = resp.GetResult_().GetMaintenanceStatusResult_() - } - if retryErr != nil { return resp, result, errors.Wrap(retryErr, "Unable to recover connection") } + if resp.GetResult_() != nil { + result = resp.GetResult_().GetMaintenanceStatusResult_() + } + return resp, result, nil } diff --git a/retry.go b/retry.go index 90cf55e..36740d9 100644 --- a/retry.go +++ b/retry.go @@ -16,11 +16,13 @@ package realis import ( "math/rand" + "net/url" "time" "github.com/paypal/gorealis/gen-go/apache/aurora" "github.com/paypal/gorealis/response" "github.com/pkg/errors" + "github.com/rdelval/thrift/lib/go/thrift" ) type Backoff struct { @@ -158,6 +160,19 @@ func (r *realisClient) thriftCallWithRetries(thriftCall auroraThriftCall) (*auro // Print out the error to the user r.logger.Printf("Client Error: %v\n", clientErr) + // Determine if error is a temporary URL error by going up the stack + e, ok := clientErr.(thrift.TTransportException) + if ok { + r.logger.DebugPrint("Encountered a transport exception") + + e, ok := e.Err().(*url.Error) + if ok { + if !IsTemporary(e) { + return nil, errors.Wrap(clientErr, "Non-temporary connection error") + } + } + } + // In the future, reestablish connection should be able to check if it is actually possible // to make a thrift call to Aurora. For now, a reconnect should always lead to a retry. r.ReestablishConn()