Detecting when a transport error is not temporary, in which case we stop retrying. Fixed a bug where results were being read from the response before the error was checked.

Renan DelValle 2018-10-29 15:03:27 -07:00
parent 6762c1784b
commit dabd7418a9
2 changed files with 37 additions and 22 deletions
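
A note on the second sentence of the commit message: when the underlying thrift call fails, thriftCallWithRetries can return a nil response together with the error, so calling resp.GetResult_() before inspecting retryErr (as the removed lines below do) risks a nil pointer dereference. The following is a minimal, self-contained sketch of the corrected ordering; Response, Result, and callWithRetries are simplified stand-ins for illustration, not the gorealis API.

package main

import (
	"errors"
	"fmt"
)

// Simplified stand-ins for the generated Aurora response types used in the diff.
type Result struct{ Hosts []string }

type Response struct{ result *Result }

func (r *Response) GetResult_() *Result { return r.result }

// callWithRetries mimics a retried thrift call: on failure it returns a nil
// response together with the error, which is why the error must be checked
// before the response is dereferenced.
func callWithRetries(fail bool) (*Response, error) {
	if fail {
		return nil, errors.New("transport error")
	}
	return &Response{result: &Result{Hosts: []string{"agent-1"}}}, nil
}

func main() {
	resp, retryErr := callWithRetries(true)

	// Check the error first; resp may be nil at this point.
	if retryErr != nil {
		fmt.Println("call failed:", retryErr)
		return
	}

	// Only read the result once we know the call succeeded.
	if resp.GetResult_() != nil {
		fmt.Println("hosts:", resp.GetResult_().Hosts)
	}
}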


@@ -555,14 +555,14 @@ func (r *realisClient) GetJobs(role string) (*aurora.Response, *aurora.GetJobsRe
 		return r.readonlyClient.GetJobs(role)
 	})
-	if resp.GetResult_() != nil {
-		result = resp.GetResult_().GetJobsResult_
-	}
 	if retryErr != nil {
 		return nil, result, errors.Wrap(retryErr, "Error getting Jobs from Aurora Scheduler")
 	}
+	if resp.GetResult_() != nil {
+		result = resp.GetResult_().GetJobsResult_
+	}
 	return resp, result, nil
 }
@@ -635,7 +635,7 @@ func (r *realisClient) CreateService(auroraJob Job, settings *aurora.JobUpdateSe
 		return resp, nil, errors.Wrap(err, "unable to create service")
 	}
-	if resp != nil && resp.GetResult_() != nil {
+	if resp.GetResult_() != nil {
 		return resp, resp.GetResult_().GetStartJobUpdateResult_(), nil
 	}
@@ -879,7 +879,7 @@ func (r *realisClient) GetPendingReason(query *aurora.TaskQuery) (pendingReasons
 	var result map[*aurora.PendingReason]bool
-	if resp != nil && resp.GetResult_() != nil {
+	if resp.GetResult_() != nil {
 		result = resp.GetResult_().GetGetPendingReasonResult_().GetReasons()
 	}
 	for reason := range result {
@@ -999,14 +999,14 @@ func (r *realisClient) DrainHosts(hosts ...string) (*aurora.Response, *aurora.Dr
 		return r.adminClient.DrainHosts(drainList)
 	})
-	if resp != nil && resp.GetResult_() != nil {
-		result = resp.GetResult_().GetDrainHostsResult_()
-	}
 	if retryErr != nil {
 		return resp, result, errors.Wrap(retryErr, "Unable to recover connection")
 	}
+	if resp.GetResult_() != nil {
+		result = resp.GetResult_().GetDrainHostsResult_()
+	}
 	return resp, result, nil
 }
@@ -1030,14 +1030,14 @@ func (r *realisClient) StartMaintenance(hosts ...string) (*aurora.Response, *aur
 		return r.adminClient.StartMaintenance(hostList)
 	})
-	if resp.GetResult_() != nil {
-		result = resp.GetResult_().GetStartMaintenanceResult_()
-	}
 	if retryErr != nil {
 		return resp, result, errors.Wrap(retryErr, "Unable to recover connection")
 	}
+	if resp.GetResult_() != nil {
+		result = resp.GetResult_().GetStartMaintenanceResult_()
+	}
 	return resp, result, nil
 }
@@ -1061,14 +1061,14 @@ func (r *realisClient) EndMaintenance(hosts ...string) (*aurora.Response, *auror
 		return r.adminClient.EndMaintenance(hostList)
 	})
-	if resp.GetResult_() != nil {
-		result = resp.GetResult_().GetEndMaintenanceResult_()
-	}
 	if retryErr != nil {
 		return resp, result, errors.Wrap(retryErr, "Unable to recover connection")
 	}
+	if resp.GetResult_() != nil {
+		result = resp.GetResult_().GetEndMaintenanceResult_()
+	}
 	return resp, result, nil
 }
@@ -1094,14 +1094,14 @@ func (r *realisClient) MaintenanceStatus(hosts ...string) (*aurora.Response, *au
 		return r.adminClient.MaintenanceStatus(hostList)
 	})
-	if resp.GetResult_() != nil {
-		result = resp.GetResult_().GetMaintenanceStatusResult_()
-	}
 	if retryErr != nil {
 		return resp, result, errors.Wrap(retryErr, "Unable to recover connection")
 	}
+	if resp.GetResult_() != nil {
+		result = resp.GetResult_().GetMaintenanceStatusResult_()
+	}
 	return resp, result, nil
 }


@@ -16,11 +16,13 @@ package realis
 import (
 	"math/rand"
+	"net/url"
 	"time"
 	"github.com/paypal/gorealis/gen-go/apache/aurora"
 	"github.com/paypal/gorealis/response"
 	"github.com/pkg/errors"
+	"github.com/rdelval/thrift/lib/go/thrift"
 )
 type Backoff struct {
@@ -158,6 +160,19 @@ func (r *realisClient) thriftCallWithRetries(thriftCall auroraThriftCall) (*auro
 			// Print out the error to the user
 			r.logger.Printf("Client Error: %v\n", clientErr)
+			// Determine if error is a temporary URL error by going up the stack
+			e, ok := clientErr.(thrift.TTransportException)
+			if ok {
+				r.logger.DebugPrint("Encountered a transport exception")
+				e, ok := e.Err().(*url.Error)
+				if ok {
+					if !IsTemporary(e) {
+						return nil, errors.Wrap(clientErr, "Non-temporary connection error")
+					}
+				}
+			}
 			// In the future, reestablish connection should be able to check if it is actually possible
 			// to make a thrift call to Aurora. For now, a reconnect should always lead to a retry.
 			r.ReestablishConn()
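
IsTemporary is called in the hunk above but is not defined in this diff. Below is a minimal sketch of what such a helper might look like, assuming it only checks for the net-style Temporary() bool interface that *url.Error implements; this is an illustration, not necessarily the gorealis implementation.

package main

import (
	"errors"
	"fmt"
	"net/url"
)

// temporary matches errors that expose a Temporary() hint, as net.Error
// implementations such as *url.Error do.
type temporary interface {
	Temporary() bool
}

// IsTemporary reports whether err advertises itself as temporary.
// Illustrative sketch only; the real helper in gorealis may differ.
func IsTemporary(err error) bool {
	if t, ok := err.(temporary); ok {
		return t.Temporary()
	}
	return false
}

func main() {
	// A *url.Error delegates Temporary() to the error it wraps; a plain
	// errors.New value has no Temporary() method, so this prints false.
	e := &url.Error{Op: "Post", URL: "http://aurora.example.com/api", Err: errors.New("connection refused")}
	fmt.Println(IsTemporary(e))
}

Under that assumption, a wrapped error that does not report itself as temporary makes the retry loop above return immediately instead of reconnecting and retrying.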