Thread safety, misc fixes, and refactoring (#51)
* Changing incorrect license in some source files.
* Changing CreateService to mimic CreateJob by setting the batch size to the instance count.
* Changing Getcerts to GetCerts to match the style of the rest of the codebase.
* Overhauled error handling. Backoff now recognizes temporary errors and continues to retry if it finds one.
* Changed thrift function call wrapper to be more explicitly named and to perform more safety checks.
* Moved Jitter function from realis to retry.
* API code is now more uniform and follows a certain template.
* Lock added whenever a thrift call is made or when a modification is done to the connection. Note that calling ReestablishConn externally may result in some race conditions. We will move to make this function private in the near future.
* Added test for Realis session thread safety. Tested ScheduleStatus monitor. Tested monitor timing out.
* Returning nil whenever there is an error return so that there are no ambiguities.
* Using defer with unlock so that the lock is still released if a panic is invoked.
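The locking pattern described in the commit message, as a minimal self-contained sketch; the realisClient struct, its lock field name, and the getJobs method are illustrative assumptions rather than code from this commit:

    // Sketch: guard a thrift call with a mutex and release it via defer,
    // so the lock cannot stay held if the call panics.
    package main

    import (
        "fmt"
        "sync"
    )

    type realisClient struct {
        lock sync.Mutex // assumed field name; guards the thrift connection
    }

    // getJobs is a stand-in for any thrift call wrapper on the client.
    func (r *realisClient) getJobs(role string) (string, error) {
        r.lock.Lock()
        defer r.lock.Unlock() // released even if the call below panics

        // ... perform the thrift call while holding the lock ...
        return fmt.Sprintf("jobs for %s", role), nil
    }

    func main() {
        c := &realisClient{}
        resp, err := c.getJobs("vagrant")
        if err != nil {
            // On error, return nil for the result so there is no ambiguity,
            // as the commit message describes.
            fmt.Println("error:", err)
            return
        }
        fmt.Println(resp)
    }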
parent b2ffb73183
commit a941bcb679
6 changed files with 256 additions and 245 deletions
retry.go (39 changed lines)
@@ -20,22 +20,26 @@ import (
    "errors"
    "time"

    "github.com/paypal/gorealis/gen-go/apache/aurora"
    "math/rand"
)

const (
    ConnRefusedErr   = "connection refused"
    NoLeaderFoundErr = "No leader found"
)

var RetryConnErr = errors.New("error occured during with aurora retrying")

// Jitter returns a time.Duration between duration and duration + maxFactor *
// duration.
//
// This allows clients to avoid converging on periodic behavior. If maxFactor
// is 0.0, a suggested default value will be chosen.
func Jitter(duration time.Duration, maxFactor float64) time.Duration {
    if maxFactor <= 0.0 {
        maxFactor = 1.0
    }
    wait := duration + time.Duration(rand.Float64()*maxFactor*float64(duration))
    return wait
}

// ConditionFunc returns true if the condition is satisfied, or an error
// if the loop should be aborted.
type ConditionFunc func() (done bool, err error)

type AuroraThriftCall func() (resp *aurora.Response, err error)

// Modified version of the Kubernetes exponential-backoff code.
// ExponentialBackoff repeats a condition check with exponential backoff.
//
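As a hedged usage sketch of the Jitter helper from the hunk above: the import alias, the base delay, the factor of 0.5, and the flaky operation are assumptions chosen for illustration only:

    // Sketch: sleep a jittered delay between retry attempts so that many
    // clients do not retry in lockstep.
    package main

    import (
        "errors"
        "fmt"
        "time"

        realis "github.com/paypal/gorealis" // assumed import path for the package containing retry.go
    )

    // flaky is a placeholder for an operation that fails transiently.
    func flaky(attempt int) error {
        if attempt < 3 {
            return errors.New("transient failure")
        }
        return nil
    }

    func main() {
        base := 500 * time.Millisecond
        for attempt := 1; attempt <= 5; attempt++ {
            if err := flaky(attempt); err == nil {
                fmt.Println("succeeded on attempt", attempt)
                return
            }
            // Sleep for a duration chosen uniformly between base and base + 0.5*base.
            time.Sleep(realis.Jitter(base, 0.5))
        }
        fmt.Println("gave up after 5 attempts")
    }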
@@ -76,20 +80,3 @@ func ExponentialBackoff(backoff Backoff, condition ConditionFunc) error {
    }
    return NewTimeoutError(errors.New("Timed out while retrying"))
}

// CheckAndRetryConn takes a Realis client and a thrift API function to call, and returns the response and an error.
// If the error from the API call is retryable, the function re-establishes the connection with Aurora by getting the latest Aurora master from ZooKeeper.
// If the error is retryable, it returns resp along with RetryConnErr.
func CheckAndRetryConn(r Realis, auroraCall AuroraThriftCall) (*aurora.Response, error) {
    resp, cliErr := auroraCall()

    // TODO: Return different error type based on the error that was returned by the API call
    if cliErr != nil {
        r.ReestablishConn()
        return resp, NewPermamentError(RetryConnErr)
    }
    if resp != nil && resp.GetResponseCode() == aurora.ResponseCode_ERROR_TRANSIENT {
        return resp, NewTemporaryError(errors.New("Aurora scheduler temporarily unavailable"))
    }
    return resp, nil
}
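A hedged sketch of how CheckAndRetryConn from the hunk above can be driven; the import alias, the placeholder closure, and the nil client are assumptions made for illustration, not code from this commit:

    // Sketch: driving CheckAndRetryConn with an AuroraThriftCall closure.
    package main

    import (
        "fmt"

        realis "github.com/paypal/gorealis" // assumed import path for the package containing retry.go
        "github.com/paypal/gorealis/gen-go/apache/aurora"
    )

    // exampleCall is a placeholder closure; a real one would invoke a thrift
    // method against the scheduler and return its response.
    var exampleCall realis.AuroraThriftCall = func() (*aurora.Response, error) {
        return &aurora.Response{ResponseCode: aurora.ResponseCode_OK}, nil
    }

    func main() {
        // Passing a nil Realis client is only safe here because the placeholder
        // closure never returns an error, so ReestablishConn is never invoked;
        // a real program would pass a constructed client.
        resp, err := realis.CheckAndRetryConn(nil, exampleCall)
        if err != nil {
            // A permanent error means the call failed and the connection was reset;
            // a temporary error means the scheduler answered ERROR_TRANSIENT.
            fmt.Println("call failed:", err)
            return
        }
        fmt.Println("response code:", resp.GetResponseCode())
    }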