Adding more fine-grained controls to the retry mechanism. The retry mechanism may now be configured to fail fast instead of retrying when a permanent connection error is hit (failOnPermanentErrors), and to stop retrying and return immediately when a timeout error is encountered (returnOnTimeout).
parent fe4a0dc06e
commit 55cf9bcb70
2 changed files with 102 additions and 75 deletions
retry.go (35 changed lines)
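The two new controls surface at different levels: failOnPermanentErrors is read from the client configuration via c.RealisConfig().failOnPermanentErrors, while returnOnTimeout becomes a per-call parameter of thriftCallWithRetries. A hypothetical caller-side sketch of the configuration half; the FailOnPermanentErrors option and the NewClient/SchedulerUrl constructors are assumed names for illustration, only the unexported field itself appears in this diff:

    // Hypothetical usage sketch; option and constructor names are assumed.
    client, err := realis.NewClient(
        realis.SchedulerUrl("http://localhost:8081"),
        realis.FailOnPermanentErrors(true), // assumed option wiring the failOnPermanentErrors field
    )
    if err != nil {
        log.Fatal(err)
    }
    // Calls made through this client would fail fast on permanent connection
    // errors instead of consuming the remaining retry steps.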
@@ -69,6 +69,7 @@ func ExponentialBackoff(backoff Backoff, logger Logger, condition ConditionFunc)
 	var err error
 	var ok bool
 	var curStep int
+
 	duration := backoff.Duration

 	for curStep = 0; curStep < backoff.Steps; curStep++ {
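For context, the loop above walks a geometric schedule: each iteration multiplies the previous wait by backoff.Factor. A self-contained sketch of that arithmetic, with field names mirroring the Backoff struct used above and illustrative values:

    package main

    import (
        "fmt"
        "time"
    )

    // Backoff mirrors the fields referenced in the hunk above.
    type Backoff struct {
        Duration time.Duration // initial wait
        Factor   float64       // multiplier applied after each attempt
        Steps    int           // maximum number of attempts
        Jitter   float64       // fractional randomization, applied via Jitter below
    }

    func main() {
        b := Backoff{Duration: 100 * time.Millisecond, Factor: 2.0, Steps: 5}
        duration := b.Duration
        for curStep := 0; curStep < b.Steps; curStep++ {
            fmt.Printf("step %d: wait %v\n", curStep, duration)
            duration = time.Duration(float64(duration) * b.Factor)
        }
        // Prints 100ms, 200ms, 400ms, 800ms, 1.6s: five attempts, geometric growth.
    }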
@@ -80,7 +81,7 @@ func ExponentialBackoff(backoff Backoff, logger Logger, condition ConditionFunc)
 			adjusted = Jitter(duration, backoff.Jitter)
 		}

-		logger.Printf("A retriable error occurred during function call, backing off for %v before retrying\n", adjusted)
+		logger.Printf("A retryable error occurred during function call, backing off for %v before retrying\n", adjusted)
 		time.Sleep(adjusted)
 		duration = time.Duration(float64(duration) * backoff.Factor)
 	}
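Jitter itself is not part of this diff. A common implementation, and one plausible reading of the call above (an assumption, not this file's actual code), spreads the wait over [d, d + maxFactor*d) so that many clients retrying at once do not synchronize:

    package main

    import (
        "fmt"
        "math/rand"
        "time"
    )

    // Jitter returns a duration in [d, d + maxFactor*d). With maxFactor = 0.5,
    // a 1s base wait lands anywhere between 1s and 1.5s.
    func Jitter(duration time.Duration, maxFactor float64) time.Duration {
        if maxFactor <= 0.0 {
            return duration
        }
        return duration + time.Duration(rand.Float64()*maxFactor*float64(duration))
    }

    func main() {
        fmt.Println(Jitter(time.Second, 0.5))
    }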
@@ -119,10 +120,11 @@ func ExponentialBackoff(backoff Backoff, logger Logger, condition ConditionFunc)
 type auroraThriftCall func() (resp *aurora.Response, err error)

 // Duplicates the functionality of ExponentialBackoff but is specifically targeted towards ThriftCalls.
-func (c *Client) thriftCallWithRetries(thriftCall auroraThriftCall) (*aurora.Response, error) {
+func (c *Client) thriftCallWithRetries(returnOnTimeout bool, thriftCall auroraThriftCall) (*aurora.Response, error) {
 	var resp *aurora.Response
 	var clientErr error
 	var curStep int
+	var timeouts int

 	backoff := c.config.backoff
 	duration := backoff.Duration
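With returnOnTimeout as the new leading parameter, every internal call site has to pick a timeout behavior explicitly. A hypothetical call site; someAuroraCall is a placeholder for a generated thrift method and is not part of this diff:

    resp, err := c.thriftCallWithRetries(
        true, // returnOnTimeout: hand control back to the caller on connection timeout
        func() (resp *aurora.Response, err error) {
            return someAuroraCall() // placeholder for e.g. a KillTasks or GetJobs call
        })
    if err != nil && IsTimedout(err) {
        // With returnOnTimeout enabled, resp may still carry whatever the
        // scheduler managed to send before the connection was closed.
    }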
@@ -136,7 +138,7 @@ func (c *Client) thriftCallWithRetries(thriftCall auroraThriftCall) (*aurora.Res
 			adjusted = Jitter(duration, backoff.Jitter)
 		}

-		c.logger.Printf("A retriable error occurred during thrift call, backing off for %v before retry %v\n", adjusted, curStep)
+		c.logger.Printf("A retryable error occurred during thrift call, backing off for %v before retry %v\n", adjusted, curStep)

 		time.Sleep(adjusted)
 		duration = time.Duration(float64(duration) * backoff.Factor)
@@ -154,7 +156,7 @@ func (c *Client) thriftCallWithRetries(thriftCall auroraThriftCall) (*aurora.Res
 		c.logger.TracePrintf("Aurora Thrift Call ended resp: %v clientErr: %v\n", resp, clientErr)
 	}()

-	// Check if our thrift call is returning an error. This is a retriable event as we don't know
+	// Check if our thrift call is returning an error. This is a retryable event as we don't know
 	// if it was caused by network issues.
 	if clientErr != nil {

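The classification below leans on Go's net error conventions: the assertion line sits outside this hunk, but the error is evidently unwrapped into a type with an Err field plus Temporary() and Timeout() methods, a shape that *url.Error fits. A standalone illustration of those predicates; the unroutable TEST-NET address and short timeout are contrived to force a timeout:

    package main

    import (
        "fmt"
        "net/http"
        "net/url"
        "time"
    )

    func main() {
        client := http.Client{Timeout: 50 * time.Millisecond}
        _, err := client.Get("http://203.0.113.1:8081") // typically times out
        if e, ok := err.(*url.Error); ok {
            // e.Timeout() drives the returnOnTimeout corner case below, while
            // e.Err != io.EOF && !e.Temporary() marks an error as permanent.
            fmt.Printf("timeout=%v temporary=%v\n", e.Timeout(), e.Temporary())
        }
    }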
@@ -177,22 +179,37 @@ func (c *Client) thriftCallWithRetries(thriftCall auroraThriftCall) (*aurora.Res
 			// EOF error occurs when the server closes the read buffer of the client. This is common
 			// when the server is overloaded and should be retried. All other errors that are permanent
 			// will not be retried.
-			if e.Err != io.EOF && !e.Temporary() {
-				return nil, errors.Wrap(clientErr, "Permanent connection error")
+			if e.Err != io.EOF && !e.Temporary() && c.RealisConfig().failOnPermanentErrors {
+				return nil, errors.Wrap(clientErr, "permanent connection error")
 			}
+			// Corner case where thrift payload was received by Aurora but connection timedout before Aurora was
+			// able to reply. In this case we will return whatever response was received and a TimedOut behaving
+			// error. Users can take special action on a timeout by using IsTimedout and reacting accordingly.
+			if e.Timeout() {
+				timeouts++
+				c.logger.DebugPrintf(
+					"Client closed connection (timedout) %d times before server responded,"+
+						" consider increasing connection timeout",
+					timeouts)
+				if returnOnTimeout {
+					return resp,
+						newTimedoutError(errors.New("client connection closed before server answer"))
+				}
+			}
 		}
 	}

 		// In the future, reestablish connection should be able to check if it is actually possible
 		// to make a thrift call to Aurora. For now, a reconnect should always lead to a retry.
-		c.ReestablishConn()
+		// Ignoring error due to the fact that an error should be retried regardless
+		_ = c.ReestablishConn()

 	} else {

 		// If there was no client error, but the response is nil, something went wrong.
 		// Ideally, we'll never encounter this but we're placing a safeguard here.
 		if resp == nil {
-			return nil, errors.New("Response from aurora is nil")
+			return nil, errors.New("response from aurora is nil")
 		}

 		// Check Response Code from thrift and make a decision to continue retrying or not.
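newTimedoutError and IsTimedout are referenced above but defined elsewhere in the package. A minimal sketch of the pattern they imply, a wrapper exposing a Timedout() marker that callers can test; the real gorealis definitions may differ in detail:

    package main

    import (
        "errors"
        "fmt"
    )

    // timedoutError decorates an error so callers can tell "connection closed
    // before the server answered" apart from other failures.
    type timedoutError struct {
        error
    }

    func (*timedoutError) Timedout() bool { return true }

    func newTimedoutError(err error) *timedoutError { return &timedoutError{err} }

    // IsTimedout reports whether err carries the Timedout marker.
    func IsTimedout(err error) bool {
        t, ok := err.(interface{ Timedout() bool })
        return ok && t.Timedout()
    }

    func main() {
        err := newTimedoutError(errors.New("client connection closed before server answer"))
        fmt.Println(IsTimedout(err)) // true
    }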
@@ -219,7 +236,7 @@ func (c *Client) thriftCallWithRetries(thriftCall auroraThriftCall) (*aurora.Res
 		// It is currently not used as a response in the scheduler so it is unknown how to handle it.
 		default:
 			c.logger.DebugPrintf("unhandled response code %v received from Aurora\n", responseCode)
-			return nil, errors.Errorf("unhandled response code from Aurora %v\n", responseCode.String())
+			return nil, errors.Errorf("unhandled response code from Aurora %v", responseCode.String())
 		}
 	}

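The default branch above is the backstop of a switch over the thrift response code. In Aurora's API the useful split is OK (done), ERROR_TRANSIENT (retry), and the permanent error codes (stop). A compressed sketch of that shape, since the handled cases fall outside this hunk and the exact set in retry.go may differ:

    switch responseCode := resp.GetResponseCode(); responseCode {
    case aurora.ResponseCode_OK:
        return resp, nil // success: stop retrying
    case aurora.ResponseCode_ERROR_TRANSIENT:
        // Transient scheduler-side failure: fall through to the next backoff step.
    case aurora.ResponseCode_ERROR, aurora.ResponseCode_INVALID_REQUEST:
        return resp, errors.New("permanent error from Aurora") // stop retrying
    default:
        c.logger.DebugPrintf("unhandled response code %v received from Aurora\n", responseCode)
        return nil, errors.Errorf("unhandled response code from Aurora %v", responseCode.String())
    }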