Simplifying retry mechanism for Thrift Calls (#56)

* Deleting permanent error as it doesn't make sense. Just return a plain old error and that will be considered permanent.
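A minimal sketch of the intended contract, using the condition-function shape consumed by ExponentialBackoff in the diff below (the doSomething helper is hypothetical):

	condition := func() (bool, error) {
		done, err := doSomething() // hypothetical operation being retried
		if err != nil {
			// A plain error is treated as permanent: the backoff loop stops
			// and surfaces it to the caller with no further retries.
			return false, err
		}
		return done, nil
	}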

* Removing the double closure as it's unmaintainable and can be error-prone. Separated backoffs into a generic one and a thrift-call-specific one.

* ZK leader finder now returns a temporary error instead of always returning no leader found and quitting. It could be that the leader info is still being propagated, so it's worth trying again.
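A sketch of that ZK behavior as described; the NewTemporaryError helper and the ZooKeeper lookup are assumptions based on this commit message, not the visible diff:

	leaderNode, err := zkConn.Get(leaderPath) // hypothetical ZK lookup
	if err != nil || len(leaderNode) == 0 {
		// Leader info may still be propagating through ZooKeeper,
		// so flag this as temporary and let the backoff loop try again.
		return false, NewTemporaryError(errors.New("no leader found"))
	}
	return true, nil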

* Adding more logging to the retry.

* Wrapping lock and unlock in an anonymous function so that we can use defer on unlock, ensuring it is called even in the case of a panic.
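A standalone sketch of the pattern (hypothetical mutex and call); because the deferred Unlock sits on the anonymous function's stack, it runs while a panic unwinds instead of leaving the lock held:

	func() {
		mu.Lock()
		defer mu.Unlock()
		mightPanic() // if this panics, the deferred Unlock still runs during unwinding
	}()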
Renan DelValle 2018-02-15 15:16:39 -08:00 committed by GitHub
parent 64948c3712
commit a43dc81ea8
4 changed files with 166 additions and 380 deletions


@@ -21,9 +21,18 @@ import (
"math/rand"
"github.com/paypal/gorealis/gen-go/apache/aurora"
"github.com/paypal/gorealis/response"
"github.com/pkg/errors"
)
type Backoff struct {
	Duration time.Duration // the base duration
	Factor   float64       // Duration is multiplied by factor each iteration
	Jitter   float64       // The amount of jitter applied each iteration
	Steps    int           // Exit with error after this many steps
}
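// Illustrative numbers (assuming the retry loop below): with Duration = 1*time.Second,
// Factor = 2.0, Jitter = 0.1, and Steps = 5, the loop sleeps roughly 1s, 2s, 4s,
// and 8s between attempts, each sleep inflated by up to 10% of random jitter,
// before giving up.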
// Jitter returns a time.Duration between duration and duration + maxFactor *
// duration.
//
@@ -89,3 +98,88 @@ func ExponentialBackoff(backoff Backoff, condition ConditionFunc) error {
		return NewTimeoutError(errors.New("Timed out while retrying"))
	}
}
type auroraThriftCall func() (resp *aurora.Response, err error)
// Duplicates the functionality of ExponentialBackoff but is specifically targeted towards thrift calls.
func (r *realisClient) ThriftCallWithRetries(thriftCall auroraThriftCall) (*aurora.Response, error) {
	var resp *aurora.Response
	var clientErr error

	backoff := r.config.backoff
	duration := backoff.Duration

	for i := 0; i < backoff.Steps; i++ {

		// If this isn't our first try, backoff before the next try.
		if i != 0 {
			adjusted := duration
			if backoff.Jitter > 0.0 {
				adjusted = Jitter(duration, backoff.Jitter)
			}

			r.logger.Printf("An error occurred during thrift call, backing off for %v before retrying\n", adjusted)

			time.Sleep(adjusted)
			duration = time.Duration(float64(duration) * backoff.Factor)
		}
		// Only allow one go-routine to make use of or modify the thrift client connection at a time.
		// Placing this in an anonymous function in order to create a new, short-lived stack, allowing unlock
		// to be run in case of a panic inside of thriftCall.
		func() {
			r.lock.Lock()
			defer r.lock.Unlock()

			resp, clientErr = thriftCall()
		}()
		// Check if our thrift call is returning an error. This is a retriable event as we don't know
		// if it was caused by network issues.
		if clientErr != nil {
			r.ReestablishConn()

			// In the future, reestablish connection should be able to check if it is actually possible
			// to make a thrift call to Aurora. For now, a reconnect should always lead to a retry.
			continue
		}

		// If there was no client error, but the response is nil, something went wrong.
		// Ideally, we'll never encounter this but we're placing a safeguard here.
		if resp == nil {
			return nil, errors.New("Response from aurora is nil")
		}
		// Check Response Code from thrift and make a decision to continue retrying or not.
		switch responseCode := resp.GetResponseCode(); responseCode {

		// If the thrift call succeeded, stop retrying.
		case aurora.ResponseCode_OK:
			return resp, nil

		// If the response code is transient, continue retrying.
		case aurora.ResponseCode_ERROR_TRANSIENT:
			r.logger.Println("Aurora replied with Transient error code, retrying")
			continue

		// Failure scenarios, these indicate a bad payload or a bad config. Stop retrying.
		// These codes are grouped into a single case so that every one of them returns the
		// combined error; as separate empty cases, the first three would silently retry.
		case aurora.ResponseCode_INVALID_REQUEST,
			aurora.ResponseCode_ERROR,
			aurora.ResponseCode_AUTH_FAILED,
			aurora.ResponseCode_JOB_UPDATING_ERROR:
			return nil, errors.New(response.CombineMessage(resp))

		// The only case that should fall down to here is a WARNING response code.
		// It is currently not used as a response in the scheduler so it is unknown how to handle it.
		default:
			return nil, errors.Errorf("unhandled response code from Aurora %v", responseCode.String())
		}
	}

	// Provide more information to the user wherever possible.
	if clientErr != nil {
		return nil, NewTimeoutError(errors.Wrap(clientErr, "Timed out while retrying, including latest error"))
	} else {
		return nil, NewTimeoutError(errors.New("Timed out while retrying"))
	}
}
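For context, a call site can now wrap a scheduler call in a single closure instead of the old double closure. A minimal sketch, assuming a thrift client field r.client and the generated GetJobSummary method:

	resp, retryErr := r.ThriftCallWithRetries(func() (*aurora.Response, error) {
		return r.client.GetJobSummary(role)
	})
	if retryErr != nil {
		return nil, errors.Wrap(retryErr, "error getting job summaries from Aurora Scheduler")
	}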