Retry temporary errors by default (#107)
* Added an Aurora URL validator in order to handle scenarios where incomplete information is passed to the client. The client will do its best to guess the missing information, such as the protocol and port. * Upgraded to testify 1.3.0. * Added a configuration option to fail on a non-temporary error. This reverts to the original behavior of the retry mechanism. However, it allows the user to opt to fail on a non-temporary error.
This commit is contained in:
parent
4ffb509939
commit
6dc4bf93b9
37 changed files with 2795 additions and 1009 deletions
129
realis.go
129
realis.go
|
@ -16,6 +16,7 @@
|
|||
package realis
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"encoding/base64"
|
||||
|
@ -31,19 +32,23 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/apache/thrift/lib/go/thrift"
|
||||
"github.com/pkg/errors"
|
||||
|
||||
"github.com/paypal/gorealis/gen-go/apache/aurora"
|
||||
"github.com/paypal/gorealis/response"
|
||||
"github.com/pkg/errors"
|
||||
)
|
||||
|
||||
const VERSION = "1.21.0"
|
||||
const VERSION = "1.21.1"
|
||||
|
||||
// TODO(rdelvalle): Move documentation to interface in order to make godoc look better/more accessible
|
||||
// TODO(rdelvalle): Move documentation to interface in order to make godoc look better and be more accessible
|
||||
// Or get rid of the interface
|
||||
type Realis interface {
|
||||
AbortJobUpdate(updateKey aurora.JobUpdateKey, message string) (*aurora.Response, error)
|
||||
AddInstances(instKey aurora.InstanceKey, count int32) (*aurora.Response, error)
|
||||
CreateJob(auroraJob Job) (*aurora.Response, error)
|
||||
CreateService(auroraJob Job, settings *aurora.JobUpdateSettings) (*aurora.Response, *aurora.StartJobUpdateResult_, error)
|
||||
CreateService(
|
||||
auroraJob Job,
|
||||
settings *aurora.JobUpdateSettings) (*aurora.Response, *aurora.StartJobUpdateResult_, error)
|
||||
DescheduleCronJob(key *aurora.JobKey) (*aurora.Response, error)
|
||||
FetchTaskConfig(instKey aurora.InstanceKey) (*aurora.TaskConfig, error)
|
||||
GetInstanceIds(key *aurora.JobKey, states []aurora.ScheduleStatus) ([]int32, error)
|
||||
|
@ -108,13 +113,14 @@ type RealisConfig struct {
|
|||
transport thrift.TTransport
|
||||
protoFactory thrift.TProtocolFactory
|
||||
logger *LevelLogger
|
||||
InsecureSkipVerify bool
|
||||
insecureSkipVerify bool
|
||||
certspath string
|
||||
clientKey, clientCert string
|
||||
options []ClientOption
|
||||
debug bool
|
||||
trace bool
|
||||
zkOptions []ZKOpt
|
||||
failOnPermanentErrors bool
|
||||
}
|
||||
|
||||
var defaultBackoff = Backoff{
|
||||
|
@ -124,11 +130,10 @@ var defaultBackoff = Backoff{
|
|||
Jitter: 0.1,
|
||||
}
|
||||
|
||||
const APIPath = "/api"
|
||||
|
||||
// ClientOption - An alias for a function that modifies the realis config object
|
||||
type ClientOption func(*RealisConfig)
|
||||
|
||||
//Config sets for options in RealisConfig.
|
||||
// BasicAuth - Set authentication used against Apache Shiro in the Aurora scheduler
|
||||
func BasicAuth(username, password string) ClientOption {
|
||||
return func(config *RealisConfig) {
|
||||
config.username = username
|
||||
|
@ -136,26 +141,29 @@ func BasicAuth(username, password string) ClientOption {
|
|||
}
|
||||
}
|
||||
|
||||
// SchedulerUrl - Set the immediate location of the current Aurora scheduler leader.
// The returned ClientOption stores the given url in the config's url field.
|
||||
func SchedulerUrl(url string) ClientOption {
|
||||
return func(config *RealisConfig) {
|
||||
config.url = url
|
||||
}
|
||||
}
|
||||
|
||||
// TimeoutMS - Set the connection timeout for an HTTP post request in milliseconds.
// The returned ClientOption stores timeout in the config's timeoutms field; the
// transport constructors receive that value and the HTTP client timeout is
// derived from it as time.Millisecond * time.Duration(timeoutMs).
|
||||
func TimeoutMS(timeout int) ClientOption {
|
||||
return func(config *RealisConfig) {
|
||||
config.timeoutms = timeout
|
||||
}
|
||||
}
|
||||
|
||||
// ZKCluster - Set a clusters.json provided cluster configuration to the client.
// The returned ClientOption stores the cluster pointer in the config's cluster
// field; NewRealisClient passes the stored cluster to LeaderFromZK to obtain
// the scheduler URL.
|
||||
func ZKCluster(cluster *Cluster) ClientOption {
|
||||
return func(config *RealisConfig) {
|
||||
config.cluster = cluster
|
||||
}
|
||||
}
|
||||
|
||||
// ZKUrl - Set the direct location of a Zookeeper node on which the Aurora leader registers itself
|
||||
func ZKUrl(url string) ClientOption {
|
||||
|
||||
opts := []ZKOpt{ZKEndpoints(strings.Split(url, ",")...), ZKPath("/aurora/scheduler")}
|
||||
|
||||
return func(config *RealisConfig) {
|
||||
|
@ -167,6 +175,7 @@ func ZKUrl(url string) ClientOption {
|
|||
}
|
||||
}
|
||||
|
||||
// Retries - Configure the retry mechanism for the client
|
||||
func Retries(backoff Backoff) ClientOption {
|
||||
return func(config *RealisConfig) {
|
||||
config.backoff = backoff
|
||||
|
@ -191,9 +200,9 @@ func BackOff(b Backoff) ClientOption {
|
|||
}
|
||||
}
|
||||
|
||||
func InsecureSkipVerify(InsecureSkipVerify bool) ClientOption {
|
||||
func InsecureSkipVerify(insecureSkipVerify bool) ClientOption {
|
||||
return func(config *RealisConfig) {
|
||||
config.InsecureSkipVerify = InsecureSkipVerify
|
||||
config.insecureSkipVerify = insecureSkipVerify
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -238,12 +247,24 @@ func Trace() ClientOption {
|
|||
}
|
||||
}
|
||||
|
||||
// FailOnPermanentErrors - If the client encounters a connection error the standard library
|
||||
// considers permanent, stop retrying and return an error to the user.
// The option takes no arguments: applying it sets failOnPermanentErrors to true
// on the config. NOTE(review): the retry code that reads this flag is not
// visible in this view — confirm where it is consumed.
|
||||
func FailOnPermanentErrors() ClientOption {
|
||||
return func(config *RealisConfig) {
|
||||
config.failOnPermanentErrors = true
|
||||
}
|
||||
}
|
||||
|
||||
func newTJSONTransport(url string, timeout int, config *RealisConfig) (thrift.TTransport, error) {
|
||||
trans, err := defaultTTransport(url, timeout, config)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "unable to create transport")
|
||||
}
|
||||
httpTrans := (trans).(*thrift.THttpClient)
|
||||
httpTrans, ok := (trans).(*thrift.THttpClient)
|
||||
if !ok {
|
||||
return nil, errors.Wrap(err, "transport does not contain a thrift client")
|
||||
}
|
||||
|
||||
httpTrans.SetHeader("Content-Type", "application/x-thrift")
|
||||
httpTrans.SetHeader("User-Agent", "gorealis v"+VERSION)
|
||||
return trans, err
|
||||
|
@ -254,7 +275,11 @@ func newTBinTransport(url string, timeout int, config *RealisConfig) (thrift.TTr
|
|||
if err != nil {
|
||||
return nil, errors.Wrap(err, "unable to create transport")
|
||||
}
|
||||
httpTrans := (trans).(*thrift.THttpClient)
|
||||
httpTrans, ok := (trans).(*thrift.THttpClient)
|
||||
if !ok {
|
||||
return nil, errors.Wrap(err, "transport does not contain a thrift client")
|
||||
}
|
||||
|
||||
httpTrans.DelHeader("Content-Type") // Workaround for using thrift HttpPostClient
|
||||
httpTrans.SetHeader("Accept", "application/vnd.apache.thrift.binary")
|
||||
httpTrans.SetHeader("Content-Type", "application/vnd.apache.thrift.binary")
|
||||
|
@ -328,7 +353,7 @@ func NewRealisClient(options ...ClientOption) (Realis, error) {
|
|||
url, err = LeaderFromZK(*config.cluster)
|
||||
// If ZK is configured, throw an error if the leader is unable to be determined
|
||||
if err != nil {
|
||||
return nil, NewTemporaryError(errors.Wrap(err, "unable to use zk to get leader "))
|
||||
return nil, NewTemporaryError(errors.Wrap(err, "unable to use zk to get leader"))
|
||||
}
|
||||
config.logger.Println("Scheduler URL from ZK: ", url)
|
||||
} else if config.url != "" {
|
||||
|
@ -338,6 +363,13 @@ func NewRealisClient(options ...ClientOption) (Realis, error) {
|
|||
return nil, errors.New("incomplete Options -- url, cluster.json, or Zookeeper address required")
|
||||
}
|
||||
|
||||
config.logger.Println("Addresss obtained: ", url)
|
||||
url, err = validateAuroraURL(url)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "invalid Aurora url")
|
||||
}
|
||||
config.logger.Println("Corrected address: ", url)
|
||||
|
||||
if config.jsonTransport {
|
||||
trans, err := newTJSONTransport(url, config.timeoutms, config)
|
||||
if err != nil {
|
||||
|
@ -359,7 +391,10 @@ func NewRealisClient(options ...ClientOption) (Realis, error) {
|
|||
|
||||
// Adding Basic Authentication.
|
||||
if config.username != "" && config.password != "" {
|
||||
httpTrans := (config.transport).(*thrift.THttpClient)
|
||||
httpTrans, ok := (config.transport).(*thrift.THttpClient)
|
||||
if !ok {
|
||||
return nil, errors.New("transport provided does not contain an THttpClient")
|
||||
}
|
||||
httpTrans.SetHeader("Authorization", "Basic "+basicAuth(config.username, config.password))
|
||||
}
|
||||
|
||||
|
@ -405,7 +440,7 @@ func GetCerts(certPath string) (*x509.CertPool, error) {
|
|||
func defaultTTransport(url string, timeoutMs int, config *RealisConfig) (thrift.TTransport, error) {
|
||||
var transport http.Transport
|
||||
if config != nil {
|
||||
tlsConfig := &tls.Config{InsecureSkipVerify: config.InsecureSkipVerify}
|
||||
tlsConfig := &tls.Config{InsecureSkipVerify: config.insecureSkipVerify}
|
||||
|
||||
if config.certspath != "" {
|
||||
rootCAs, err := GetCerts(config.certspath)
|
||||
|
@ -433,11 +468,13 @@ func defaultTTransport(url string, timeoutMs int, config *RealisConfig) (thrift.
|
|||
}
|
||||
|
||||
trans, err := thrift.NewTHttpClientWithOptions(
|
||||
url+APIPath,
|
||||
url,
|
||||
thrift.THttpClientOptions{
|
||||
Client: &http.Client{
|
||||
Timeout: time.Millisecond * time.Duration(timeoutMs),
|
||||
Transport: &transport}})
|
||||
Transport: &transport,
|
||||
},
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "Error creating transport")
|
||||
|
@ -504,7 +541,7 @@ func (r *realisClient) GetInstanceIds(key *aurora.JobKey, states []aurora.Schedu
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.GetTasksWithoutConfigs(nil, taskQ)
|
||||
return r.client.GetTasksWithoutConfigs(context.TODO(), taskQ)
|
||||
})
|
||||
|
||||
// If we encountered an error we couldn't recover from by retrying, return an error to the user
|
||||
|
@ -529,7 +566,7 @@ func (r *realisClient) GetJobUpdateSummaries(jobUpdateQuery *aurora.JobUpdateQue
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.readonlyClient.GetJobUpdateSummaries(nil, jobUpdateQuery)
|
||||
return r.readonlyClient.GetJobUpdateSummaries(context.TODO(), jobUpdateQuery)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -546,7 +583,7 @@ func (r *realisClient) GetJobs(role string) (*aurora.Response, *aurora.GetJobsRe
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.readonlyClient.GetJobs(nil, role)
|
||||
return r.readonlyClient.GetJobs(context.TODO(), role)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -567,7 +604,7 @@ func (r *realisClient) KillInstances(key *aurora.JobKey, instances ...int32) (*a
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.KillTasks(nil, key, instances, "")
|
||||
return r.client.KillTasks(context.TODO(), key, instances, "")
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -589,7 +626,7 @@ func (r *realisClient) KillJob(key *aurora.JobKey) (*aurora.Response, error) {
|
|||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
// Giving the KillTasks thrift call an empty set tells the Aurora scheduler to kill all active shards
|
||||
return r.client.KillTasks(nil, key, nil, "")
|
||||
return r.client.KillTasks(context.TODO(), key, nil, "")
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -609,7 +646,7 @@ func (r *realisClient) CreateJob(auroraJob Job) (*aurora.Response, error) {
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.CreateJob(nil, auroraJob.JobConfig())
|
||||
return r.client.CreateJob(context.TODO(), auroraJob.JobConfig())
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -619,7 +656,9 @@ func (r *realisClient) CreateJob(auroraJob Job) (*aurora.Response, error) {
|
|||
}
|
||||
|
||||
// This API uses an update thrift call to create the services giving a few more robust features.
|
||||
func (r *realisClient) CreateService(auroraJob Job, settings *aurora.JobUpdateSettings) (*aurora.Response, *aurora.StartJobUpdateResult_, error) {
|
||||
func (r *realisClient) CreateService(
|
||||
auroraJob Job,
|
||||
settings *aurora.JobUpdateSettings) (*aurora.Response, *aurora.StartJobUpdateResult_, error) {
|
||||
// Create a new job update object and ship it to the StartJobUpdate api
|
||||
update := NewUpdateJob(auroraJob.TaskConfig(), settings)
|
||||
update.InstanceCount(auroraJob.GetInstanceCount())
|
||||
|
@ -646,7 +685,7 @@ func (r *realisClient) ScheduleCronJob(auroraJob Job) (*aurora.Response, error)
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.ScheduleCronJob(nil, auroraJob.JobConfig())
|
||||
return r.client.ScheduleCronJob(context.TODO(), auroraJob.JobConfig())
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -662,7 +701,7 @@ func (r *realisClient) DescheduleCronJob(key *aurora.JobKey) (*aurora.Response,
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.DescheduleCronJob(nil, key)
|
||||
return r.client.DescheduleCronJob(context.TODO(), key)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -680,7 +719,7 @@ func (r *realisClient) StartCronJob(key *aurora.JobKey) (*aurora.Response, error
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.StartCronJob(nil, key)
|
||||
return r.client.StartCronJob(context.TODO(), key)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -697,7 +736,7 @@ func (r *realisClient) RestartInstances(key *aurora.JobKey, instances ...int32)
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.RestartShards(nil, key, instances)
|
||||
return r.client.RestartShards(context.TODO(), key, instances)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -720,7 +759,7 @@ func (r *realisClient) RestartJob(key *aurora.JobKey) (*aurora.Response, error)
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.RestartShards(nil, key, instanceIds)
|
||||
return r.client.RestartShards(context.TODO(), key, instanceIds)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -741,7 +780,7 @@ func (r *realisClient) StartJobUpdate(updateJob *UpdateJob, message string) (*au
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
true,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.StartJobUpdate(nil, updateJob.req, message)
|
||||
return r.client.StartJobUpdate(context.TODO(), updateJob.req, message)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -765,7 +804,7 @@ func (r *realisClient) AbortJobUpdate(updateKey aurora.JobUpdateKey, message str
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.AbortJobUpdate(nil, &updateKey, message)
|
||||
return r.client.AbortJobUpdate(context.TODO(), &updateKey, message)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -774,7 +813,11 @@ func (r *realisClient) AbortJobUpdate(updateKey aurora.JobUpdateKey, message str
|
|||
|
||||
// Make this call synchronous by blocking until the job has successfully transitioned to aborted
|
||||
m := Monitor{Client: r}
|
||||
_, err := m.JobUpdateStatus(updateKey, map[aurora.JobUpdateStatus]bool{aurora.JobUpdateStatus_ABORTED: true}, time.Second*5, time.Minute)
|
||||
_, err := m.JobUpdateStatus(
|
||||
updateKey,
|
||||
map[aurora.JobUpdateStatus]bool{aurora.JobUpdateStatus_ABORTED: true},
|
||||
time.Second*5,
|
||||
time.Minute)
|
||||
|
||||
return resp, err
|
||||
}
|
||||
|
@ -787,7 +830,7 @@ func (r *realisClient) PauseJobUpdate(updateKey *aurora.JobUpdateKey, message st
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.PauseJobUpdate(nil, updateKey, message)
|
||||
return r.client.PauseJobUpdate(context.TODO(), updateKey, message)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -805,7 +848,7 @@ func (r *realisClient) ResumeJobUpdate(updateKey *aurora.JobUpdateKey, message s
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.ResumeJobUpdate(nil, updateKey, message)
|
||||
return r.client.ResumeJobUpdate(context.TODO(), updateKey, message)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -823,7 +866,7 @@ func (r *realisClient) PulseJobUpdate(updateKey *aurora.JobUpdateKey) (*aurora.R
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.PulseJobUpdate(nil, updateKey)
|
||||
return r.client.PulseJobUpdate(context.TODO(), updateKey)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -842,7 +885,7 @@ func (r *realisClient) AddInstances(instKey aurora.InstanceKey, count int32) (*a
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.AddInstances(nil, &instKey, count)
|
||||
return r.client.AddInstances(context.TODO(), &instKey, count)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -881,7 +924,7 @@ func (r *realisClient) GetTaskStatus(query *aurora.TaskQuery) ([]*aurora.Schedul
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.GetTasksStatus(nil, query)
|
||||
return r.client.GetTasksStatus(context.TODO(), query)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -899,7 +942,7 @@ func (r *realisClient) GetPendingReason(query *aurora.TaskQuery) ([]*aurora.Pend
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.GetPendingReason(nil, query)
|
||||
return r.client.GetPendingReason(context.TODO(), query)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -923,7 +966,7 @@ func (r *realisClient) GetTasksWithoutConfigs(query *aurora.TaskQuery) ([]*auror
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.GetTasksWithoutConfigs(nil, query)
|
||||
return r.client.GetTasksWithoutConfigs(context.TODO(), query)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -949,7 +992,7 @@ func (r *realisClient) FetchTaskConfig(instKey aurora.InstanceKey) (*aurora.Task
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.GetTasksStatus(nil, taskQ)
|
||||
return r.client.GetTasksStatus(context.TODO(), taskQ)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -977,7 +1020,7 @@ func (r *realisClient) JobUpdateDetails(updateQuery aurora.JobUpdateQuery) (*aur
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.GetJobUpdateDetails(nil, &updateQuery)
|
||||
return r.client.GetJobUpdateDetails(context.TODO(), &updateQuery)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
@ -994,7 +1037,7 @@ func (r *realisClient) RollbackJobUpdate(key aurora.JobUpdateKey, message string
|
|||
resp, retryErr := r.thriftCallWithRetries(
|
||||
false,
|
||||
func() (*aurora.Response, error) {
|
||||
return r.client.RollbackJobUpdate(nil, &key, message)
|
||||
return r.client.RollbackJobUpdate(context.TODO(), &key, message)
|
||||
})
|
||||
|
||||
if retryErr != nil {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue