Misc. bug fixes and addition of debug logging (#61)

* Fixing possible race condition when passing backoff around as a pointer.

* Adding a debug logger that is turned off by default. If debug is turned on, but a logger has not been assigned, a default logger that will print to STDOUT will be created.

* Making Mutex a pointer so that there's no chance it can accidentally be copied.

* Removing a leftover helper function from before we changed how we configured the client.

* Minor changes to demonstrate how a logger can be used in conjunction to debug mode in the sample client.
This commit is contained in:
Renan DelValle 2018-04-13 11:03:29 -07:00 committed by GitHub
parent c0d2969976
commit 4f5766b443
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 156 additions and 63 deletions

View file

@ -134,7 +134,7 @@ func (r *realisClient) thriftCallWithRetries(thriftCall auroraThriftCall) (*auro
adjusted = Jitter(duration, backoff.Jitter)
}
r.logger.Printf("A retriable error occurred during thrift call, backing off for %v before retrying\n", adjusted)
r.logger.Printf("A retriable error occurred during thrift call, backing off for %v before retry %v\n", adjusted, curStep)
time.Sleep(adjusted)
duration = time.Duration(float64(duration) * backoff.Factor)
@ -146,7 +146,10 @@ func (r *realisClient) thriftCallWithRetries(thriftCall auroraThriftCall) (*auro
func() {
r.lock.Lock()
defer r.lock.Unlock()
resp, clientErr = thriftCall()
r.logger.DebugPrintf("Aurora Thrift Call ended resp: %v clientErr: %v\n", resp, clientErr)
}()
// Check if our thrift call is returning an error. This is a retriable event as we don't know
@ -154,48 +157,52 @@ func (r *realisClient) thriftCallWithRetries(thriftCall auroraThriftCall) (*auro
if clientErr != nil {
// Print out the error to the user
r.logger.Println(clientErr)
r.ReestablishConn()
r.logger.Printf("Client Error: %v\n", clientErr)
// In the future, reestablish connection should be able to check if it is actually possible
// to make a thrift call to Aurora. For now, a reconnect should always lead to a retry.
continue
}
r.ReestablishConn()
// If there was no client error, but the response is nil, something went wrong.
// Ideally, we'll never encounter this but we're placing a safeguard here.
if resp == nil {
return nil, errors.New("Response from aurora is nil")
}
} else {
// Check Response Code from thrift and make a decision to continue retrying or not.
switch responseCode := resp.GetResponseCode(); responseCode {
// If there was no client error, but the response is nil, something went wrong.
// Ideally, we'll never encounter this but we're placing a safeguard here.
if resp == nil {
return nil, errors.New("Response from aurora is nil")
}
// If the thrift call succeeded, stop retrying
case aurora.ResponseCode_OK:
return resp, nil
// Check Response Code from thrift and make a decision to continue retrying or not.
switch responseCode := resp.GetResponseCode(); responseCode {
// If the response code is transient, continue retrying
case aurora.ResponseCode_ERROR_TRANSIENT:
r.logger.Println("Aurora replied with Transient error code, retrying")
continue
// If the thrift call succeeded, stop retrying
case aurora.ResponseCode_OK:
return resp, nil
// Failure scenarios, these indicate a bad payload or a bad config. Stop retrying.
case aurora.ResponseCode_INVALID_REQUEST:
case aurora.ResponseCode_ERROR:
case aurora.ResponseCode_AUTH_FAILED:
case aurora.ResponseCode_JOB_UPDATING_ERROR:
return nil, errors.New(response.CombineMessage(resp))
// If the response code is transient, continue retrying
case aurora.ResponseCode_ERROR_TRANSIENT:
r.logger.Println("Aurora replied with Transient error code, retrying")
continue
// The only case that should fall down to here is a WARNING response code.
// It is currently not used as a response in the scheduler so it is unknown how to handle it.
default:
return nil, errors.Errorf("unhandled response code from Aurora %v", responseCode.String())
// Failure scenarios, these indicate a bad payload or a bad config. Stop retrying.
case aurora.ResponseCode_INVALID_REQUEST:
case aurora.ResponseCode_ERROR:
case aurora.ResponseCode_AUTH_FAILED:
case aurora.ResponseCode_JOB_UPDATING_ERROR:
r.logger.Println("Terminal bad reply from Aurora, won't retry")
return nil, errors.New(response.CombineMessage(resp))
// The only case that should fall down to here is a WARNING response code.
// It is currently not used as a response in the scheduler so it is unknown how to handle it.
default:
r.logger.DebugPrintf("unhandled response code %v received from Aurora\n", responseCode)
return nil, errors.Errorf("unhandled response code from Aurora %v\n", responseCode.String())
}
}
}
r.logger.DebugPrintf("it took %v retries to complete this operation\n", curStep)
if curStep > 1 {
r.config.logger.Printf("retried this thrift call %d time(s)", curStep)
}