Adding a debug logger that is turned off by default.
Info logger is enabled by default but prints out less information.
This commit is contained in:
parent
7152f568fe
commit
69442d5957
5 changed files with 129 additions and 60 deletions
78
retry.go
78
retry.go
|
@ -134,7 +134,7 @@ func (r *realisClient) thriftCallWithRetries(thriftCall auroraThriftCall) (*auro
|
|||
adjusted = Jitter(duration, backoff.Jitter)
|
||||
}
|
||||
|
||||
r.logger.Printf("A retriable error occurred during thrift call, backing off for %v before retry %v\n", adjusted, curStep+1)
|
||||
r.infoLogger.Printf("A retriable error occurred during thrift call, backing off for %v before retry %v\n", adjusted, curStep)
|
||||
|
||||
time.Sleep(adjusted)
|
||||
duration = time.Duration(float64(duration) * backoff.Factor)
|
||||
|
@ -146,8 +146,12 @@ func (r *realisClient) thriftCallWithRetries(thriftCall auroraThriftCall) (*auro
|
|||
func() {
|
||||
r.lock.Lock()
|
||||
defer r.lock.Unlock()
|
||||
r.logger.Println("Beginning Aurora Thrift Call")
|
||||
|
||||
r.debugLogger.Println("Beginning Aurora Thrift Call")
|
||||
|
||||
resp, clientErr = thriftCall()
|
||||
|
||||
r.debugLogger.Printf("Aurora Thrift Call ended resp: %v clientErr: %v\n", resp, clientErr)
|
||||
}()
|
||||
|
||||
// Check if our thrift call is returning an error. This is a retriable event as we don't know
|
||||
|
@ -155,54 +159,58 @@ func (r *realisClient) thriftCallWithRetries(thriftCall auroraThriftCall) (*auro
|
|||
if clientErr != nil {
|
||||
|
||||
// Print out the error to the user
|
||||
r.logger.Println(clientErr)
|
||||
|
||||
r.ReestablishConn()
|
||||
r.infoLogger.Printf("Client Error: %v\n", clientErr)
|
||||
|
||||
// In the future, reestablish connection should be able to check if it is actually possible
|
||||
// to make a thrift call to Aurora. For now, a reconnect should always lead to a retry.
|
||||
continue
|
||||
}
|
||||
r.ReestablishConn()
|
||||
|
||||
// If there was no client error, but the response is nil, something went wrong.
|
||||
// Ideally, we'll never encounter this but we're placing a safeguard here.
|
||||
if resp == nil {
|
||||
return nil, errors.New("Response from aurora is nil")
|
||||
}
|
||||
} else {
|
||||
|
||||
// Log response code for debugging
|
||||
r.logger.Printf("Aurora replied with code: %v\n", resp.GetResponseCode().String())
|
||||
// If there was no client error, but the response is nil, something went wrong.
|
||||
// Ideally, we'll never encounter this but we're placing a safeguard here.
|
||||
if resp == nil {
|
||||
return nil, errors.New("Response from aurora is nil")
|
||||
}
|
||||
|
||||
// Check Response Code from thrift and make a decision to continue retrying or not.
|
||||
switch responseCode := resp.GetResponseCode(); responseCode {
|
||||
// Log response code for debugging
|
||||
r.debugLogger.Printf("Aurora replied with code: %v\n", resp.GetResponseCode().String())
|
||||
|
||||
// If the thrift call succeeded, stop retrying
|
||||
case aurora.ResponseCode_OK:
|
||||
return resp, nil
|
||||
// Check Response Code from thrift and make a decision to continue retrying or not.
|
||||
switch responseCode := resp.GetResponseCode(); responseCode {
|
||||
|
||||
// If the response code is transient, continue retrying
|
||||
case aurora.ResponseCode_ERROR_TRANSIENT:
|
||||
r.logger.Println("Aurora replied with Transient error code, retrying")
|
||||
continue
|
||||
// If the thrift call succeeded, stop retrying
|
||||
case aurora.ResponseCode_OK:
|
||||
r.debugLogger.Println("OK reply from Aurora")
|
||||
return resp, nil
|
||||
|
||||
// Failure scenarios, these indicate a bad payload or a bad config. Stop retrying.
|
||||
case aurora.ResponseCode_INVALID_REQUEST:
|
||||
case aurora.ResponseCode_ERROR:
|
||||
case aurora.ResponseCode_AUTH_FAILED:
|
||||
case aurora.ResponseCode_JOB_UPDATING_ERROR:
|
||||
return nil, errors.New(response.CombineMessage(resp))
|
||||
// If the response code is transient, continue retrying
|
||||
case aurora.ResponseCode_ERROR_TRANSIENT:
|
||||
r.infoLogger.Println("Aurora replied with Transient error code, retrying")
|
||||
continue
|
||||
|
||||
// The only case that should fall down to here is a WARNING response code.
|
||||
// It is currently not used as a response in the scheduler so it is unknown how to handle it.
|
||||
default:
|
||||
r.logger.Println("unhandled response code received from Aurora")
|
||||
return nil, errors.Errorf("unhandled response code from Aurora %v", responseCode.String())
|
||||
// Failure scenarios, these indicate a bad payload or a bad config. Stop retrying.
|
||||
case aurora.ResponseCode_INVALID_REQUEST:
|
||||
case aurora.ResponseCode_ERROR:
|
||||
case aurora.ResponseCode_AUTH_FAILED:
|
||||
case aurora.ResponseCode_JOB_UPDATING_ERROR:
|
||||
r.debugLogger.Println("Terminal bad reply from Aurora, won't retry")
|
||||
return nil, errors.New(response.CombineMessage(resp))
|
||||
|
||||
// The only case that should fall down to here is a WARNING response code.
|
||||
// It is currently not used as a response in the scheduler so it is unknown how to handle it.
|
||||
default:
|
||||
r.debugLogger.Printf("unhandled response code %v received from Aurora\n", responseCode)
|
||||
return nil, errors.Errorf("unhandled response code from Aurora %v\n", responseCode.String())
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
r.debugLogger.Printf("it took %v retries to complete this operation\n", curStep)
|
||||
|
||||
if curStep > 1 {
|
||||
r.config.logger.Printf("retried this thrift call %d time(s)", curStep)
|
||||
r.config.infoLogger.Printf("retried this thrift call %d time(s)", curStep)
|
||||
}
|
||||
|
||||
// Provide more information to the user wherever possible.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue