Thread safety, misc fixes, and refactoring (#51)

* Changing incorrect license in some source files.

* Changing CreateService to mimic CreateJob by setting the batch size to the instance count.

* Changing Getcerts to GetCerts to match the style of the rest of the codebase.

* Overhauled error handling. Backoff now recognizes temporary errors and continues to retry if it finds one.

* Changed thrift function call wrapper to be more explicitly named and to perform more safety checks.

* Moved Jitter function from realis to retry.

* API code is now more uniform and follows a certain template.

* Lock added whenever a thrift call is made or when a modification is done to the connection. Note that calling ReestablishConn externally may result in some race conditions. We will move to make this function private in the near future.

* Added test for Realis session thread safety. Tested ScheduleStatus monitor. Tested monitor timing out.

* Returning a nil result whenever an error is returned so that there are no ambiguities.

* Using defer with unlock so that the lock is still released if a panic is invoked.
This commit is contained in:
Renan DelValle 2018-01-21 19:30:01 -08:00 committed by GitHub
parent b2ffb73183
commit a941bcb679
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 256 additions and 245 deletions

View file

@ -20,22 +20,26 @@ import (
"errors"
"time"
"github.com/paypal/gorealis/gen-go/apache/aurora"
"math/rand"
)
// Error-message fragments exported by this package.
// NOTE(review): presumably these are matched against the text of errors
// coming back from the connection layer to classify failures; their use is
// not visible in this part of the file — confirm against callers.
const (
// ConnRefusedErr is the literal text "connection refused".
ConnRefusedErr = "connection refused"
// NoLeaderFoundErr is the literal text "No leader found".
NoLeaderFoundErr = "No leader found"
)
// RetryConnErr is the sentinel error that CheckAndRetryConn wraps (via
// NewPermamentError) and returns when the underlying thrift call fails.
// NOTE(review): the message text contains a typo ("occured") and reads
// awkwardly; it is left untouched here because callers may match on it.
var RetryConnErr = errors.New("error occured during with aurora retrying")
// Jitter stretches duration by a random amount so that clients polling on a
// fixed period do not all converge on the same schedule.
//
// The result lies in the half-open interval
// [duration, duration + maxFactor*duration). When maxFactor is zero or
// negative, a default factor of 1.0 is substituted.
func Jitter(duration time.Duration, maxFactor float64) time.Duration {
	factor := maxFactor
	if factor <= 0.0 {
		// Fall back to the default spread of up to one extra duration.
		factor = 1.0
	}

	// Random extra delay, computed in floating point and truncated back
	// to a Duration.
	extra := rand.Float64() * factor * float64(duration)
	return duration + time.Duration(extra)
}
// ConditionFunc is the callback type accepted by ExponentialBackoff.
// It returns true if the condition is satisfied, or an error
// if the loop should be aborted.
type ConditionFunc func() (done bool, err error)
// AuroraThriftCall is a parameterless function that yields an Aurora
// response together with an error. CheckAndRetryConn accepts a value of this
// type as the thrift API call to execute.
type AuroraThriftCall func() (resp *aurora.Response, err error)
// Modified version of the Kubernetes exponential-backoff code.
// ExponentialBackoff repeats a condition check with exponential backoff.
//
@ -76,20 +80,3 @@ func ExponentialBackoff(backoff Backoff, condition ConditionFunc) error {
}
return NewTimeoutError(errors.New("Timed out while retrying"))
}
// CheckAndRetryConn invokes the supplied thrift API call once and classifies
// the outcome so that a surrounding retry loop can decide what to do next.
//
//   - If the call itself returns an error, the connection held by r is
//     re-established and the response is returned together with RetryConnErr
//     wrapped by NewPermamentError. (Per the original author's note,
//     re-establishing looks up the current Aurora master in ZooKeeper; that
//     logic is not visible here.)
//   - If the call succeeds but the response code is ERROR_TRANSIENT, the
//     response is returned together with an error wrapped by
//     NewTemporaryError.
//   - Otherwise the response is returned with a nil error.
func CheckAndRetryConn(r Realis, auroraCall AuroraThriftCall) (*aurora.Response, error) {
	response, callErr := auroraCall()

	switch {
	case callErr != nil:
		// TODO: Return different error type based on the error that was returned by the API call
		r.ReestablishConn()
		return response, NewPermamentError(RetryConnErr)

	case response != nil && response.GetResponseCode() == aurora.ResponseCode_ERROR_TRANSIENT:
		// The scheduler answered but asked us to come back later.
		return response, NewTemporaryError(errors.New("Aurora scheduler temporarily unavailable"))
	}

	return response, nil
}